From 3e17314c22eaec164d2b01b7c41466054df64cb3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Sep 2021 14:17:19 +0200 Subject: agp: define proper stubs for empty helpers The empty unmap_page_from_agp() macro causes a warning when building with 'make W=1' on a couple of architectures: drivers/char/agp/generic.c: In function 'agp_generic_destroy_page': drivers/char/agp/generic.c:1265:28: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 1265 | unmap_page_from_agp(page); Change the definitions to a 'do { } while (0)' construct to make these more reliable. Signed-off-by: Arnd Bergmann Acked-by: Helge Deller # parisc Signed-off-by: Helge Deller --- arch/powerpc/include/asm/agp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/agp.h b/arch/powerpc/include/asm/agp.h index b29b1186f819..6b6485c988dd 100644 --- a/arch/powerpc/include/asm/agp.h +++ b/arch/powerpc/include/asm/agp.h @@ -5,8 +5,8 @@ #include -#define map_page_into_agp(page) -#define unmap_page_from_agp(page) +#define map_page_into_agp(page) do {} while (0) +#define unmap_page_from_agp(page) do {} while (0) #define flush_agp_cache() mb() /* GATT allocation. Returns/accepts GATT kernel virtual address. */ -- cgit v1.2.3 From 0c9dceb9bb6dadbf340f09c69e598d4729bbb86a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Jan 2022 13:59:47 -0500 Subject: asm/user.h: killed unused macros Some of them used to be used by libbfd for a.out coredump handling. Seeing that * libbfd has their copies anyway * we don't export them into userland headers * we don't support a.out coredumps anymore let's bury the definitions. They never had in-kernel users anyway... Signed-off-by: Al Viro --- arch/alpha/include/asm/user.h | 6 ------ arch/arm/include/asm/user.h | 4 ---- arch/h8300/include/asm/user.h | 4 ---- arch/ia64/include/asm/user.h | 6 ------ arch/m68k/include/asm/user.h | 4 ---- arch/powerpc/include/asm/user.h | 5 ----- arch/s390/include/asm/user.h | 4 ---- arch/sh/include/asm/user.h | 6 ------ arch/x86/include/asm/user_32.h | 4 ---- arch/x86/include/asm/user_64.h | 4 ---- 10 files changed, 47 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/user.h b/arch/alpha/include/asm/user.h index 3df37492c7b7..c9f525a6aab8 100644 --- a/arch/alpha/include/asm/user.h +++ b/arch/alpha/include/asm/user.h @@ -45,10 +45,4 @@ struct user { char u_comm[32]; /* user command name */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_DATA_START_ADDR (u.start_data) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) - #endif /* _ALPHA_USER_H */ diff --git a/arch/arm/include/asm/user.h b/arch/arm/include/asm/user.h index c799a3c49342..167d44b550f4 100644 --- a/arch/arm/include/asm/user.h +++ b/arch/arm/include/asm/user.h @@ -77,10 +77,6 @@ struct user{ struct user_fp_struct * u_fp0;/* Used by gdb to help find the values for */ /* the FP registers. */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) /* * User specific VFP registers. If only VFPv2 is present, registers 16 to 31 diff --git a/arch/h8300/include/asm/user.h b/arch/h8300/include/asm/user.h index 2298909f24c6..161653d84b34 100644 --- a/arch/h8300/include/asm/user.h +++ b/arch/h8300/include/asm/user.h @@ -67,9 +67,5 @@ struct user { unsigned long magic; /* To uniquely identify a core file */ char u_comm[32]; /* User command that was responsible */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif diff --git a/arch/ia64/include/asm/user.h b/arch/ia64/include/asm/user.h index 0ba486651b7c..ec03d3ab8715 100644 --- a/arch/ia64/include/asm/user.h +++ b/arch/ia64/include/asm/user.h @@ -50,10 +50,4 @@ struct user { char u_comm[32]; /* user command name */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_DATA_START_ADDR (u.start_data) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) - #endif /* _ASM_IA64_USER_H */ diff --git a/arch/m68k/include/asm/user.h b/arch/m68k/include/asm/user.h index 509d555977c8..61413bff613a 100644 --- a/arch/m68k/include/asm/user.h +++ b/arch/m68k/include/asm/user.h @@ -79,9 +79,5 @@ struct user{ unsigned long magic; /* To uniquely identify a core file */ char u_comm[32]; /* User command that was responsible */ }; -#define NBPG 4096 -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif diff --git a/arch/powerpc/include/asm/user.h b/arch/powerpc/include/asm/user.h index 99443b8594e7..7fae7e597ba4 100644 --- a/arch/powerpc/include/asm/user.h +++ b/arch/powerpc/include/asm/user.h @@ -44,9 +44,4 @@ struct user { char u_comm[32]; /* user command name */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_DATA_START_ADDR (u.start_data) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif /* _ASM_POWERPC_USER_H */ diff --git a/arch/s390/include/asm/user.h b/arch/s390/include/asm/user.h index 0ca572ced21b..8e8aaf48582e 100644 --- a/arch/s390/include/asm/user.h +++ b/arch/s390/include/asm/user.h @@ -67,9 +67,5 @@ struct user { unsigned long magic; /* To uniquely identify a core file */ char u_comm[32]; /* User command that was responsible */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif /* _S390_USER_H */ diff --git a/arch/sh/include/asm/user.h b/arch/sh/include/asm/user.h index 7dfd3f6461e6..12ea0f3f4419 100644 --- a/arch/sh/include/asm/user.h +++ b/arch/sh/include/asm/user.h @@ -52,10 +52,4 @@ struct user { char u_comm[32]; /* user command name */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_DATA_START_ADDR (u.start_data) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) - #endif /* __ASM_SH_USER_H */ diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h index d72c3d66e94f..8963915e533f 100644 --- a/arch/x86/include/asm/user_32.h +++ b/arch/x86/include/asm/user_32.h @@ -124,9 +124,5 @@ struct user{ char u_comm[32]; /* User command that was responsible */ int u_debugreg[8]; }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif /* _ASM_X86_USER_32_H */ diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h index db909923611c..1dd10f07ccd6 100644 --- a/arch/x86/include/asm/user_64.h +++ b/arch/x86/include/asm/user_64.h @@ -130,9 +130,5 @@ struct user { unsigned long error_code; /* CPU error code or 0 */ unsigned long fault_address; /* CR3 or 0 */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif /* _ASM_X86_USER_64_H */ -- cgit v1.2.3 From faf01aef0570757bfbf1d655e984742c1dd38068 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 11 Jan 2022 11:54:04 +1100 Subject: KVM: PPC: Merge powerpc's debugfs entry content into generic entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment KVM on PPC creates 4 types of entries under the kvm debugfs: 1) "%pid-%fd" per a KVM instance (for all platforms); 2) "vm%pid" (for PPC Book3s HV KVM); 3) "vm%u_vcpu%u_timing" (for PPC Book3e KVM); 4) "kvm-xive-%p" (for XIVE PPC Book3s KVM, the same for XICS); The problem with this is that multiple VMs per process is not allowed for 2) and 3) which makes it possible for userspace to trigger errors when creating duplicated debugfs entries. This merges all these into 1). This defines kvm_arch_create_kvm_debugfs() similar to kvm_arch_create_vcpu_debugfs(). This defines 2 hooks in kvmppc_ops that allow specific KVM implementations add necessary entries, this adds the _e500 suffix to kvmppc_create_vcpu_debugfs_e500() to make it clear what platform it is for. This makes use of already existing kvm_arch_create_vcpu_debugfs() on PPC. This removes no more used debugfs_dir pointers from PPC kvm_arch structs. This stops removing vcpu entries as once created vcpus stay around for the entire life of a VM and removed when the KVM instance is closed, see commit d56f5136b010 ("KVM: let kvm_destroy_vm_debugfs clean up vCPU debugfs directories"). Suggested-by: Fabiano Rosas Signed-off-by: Alexey Kardashevskiy Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220111005404.162219-1-aik@ozlabs.ru --- arch/powerpc/include/asm/kvm_host.h | 6 ++---- arch/powerpc/include/asm/kvm_ppc.h | 2 ++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 31 ++++++++++++------------------- arch/powerpc/kvm/book3s_xics.c | 13 ++----------- arch/powerpc/kvm/book3s_xive.c | 13 ++----------- arch/powerpc/kvm/book3s_xive_native.c | 13 ++----------- arch/powerpc/kvm/e500.c | 1 + arch/powerpc/kvm/e500mc.c | 1 + arch/powerpc/kvm/powerpc.c | 16 +++++++++++++--- arch/powerpc/kvm/timing.c | 21 +++++---------------- arch/powerpc/kvm/timing.h | 12 +++++++----- 13 files changed, 51 insertions(+), 82 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d9bf60bf0816..faf301d0dec0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -26,6 +26,8 @@ #include #include +#define __KVM_HAVE_ARCH_VCPU_DEBUGFS + #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS @@ -295,7 +297,6 @@ struct kvm_arch { bool dawr1_enabled; pgd_t *pgtable; u64 process_table; - struct dentry *debugfs_dir; struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -673,7 +674,6 @@ struct kvm_vcpu_arch { u64 timing_min_duration[__NUMBER_OF_KVM_EXIT_TYPES]; u64 timing_max_duration[__NUMBER_OF_KVM_EXIT_TYPES]; u64 timing_last_exit; - struct dentry *debugfs_exit_timing; #endif #ifdef CONFIG_PPC_BOOK3S @@ -831,8 +831,6 @@ struct kvm_vcpu_arch { struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ struct kvmhv_tb_accumulator guest_time; /* guest execution */ struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ - - struct dentry *debugfs_dir; #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a14dbcd1b8ce..c583d0c37f31 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -314,6 +314,8 @@ struct kvmppc_ops { int (*svm_off)(struct kvm *kvm); int (*enable_dawr1)(struct kvm *kvm); bool (*hash_v3_possible)(void); + int (*create_vm_debugfs)(struct kvm *kvm); + int (*create_vcpu_debugfs)(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 213232914367..0aeb51738ca9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -2112,7 +2112,7 @@ static const struct file_operations debugfs_htab_fops = { void kvmppc_mmu_debugfs_init(struct kvm *kvm) { - debugfs_create_file("htab", 0400, kvm->arch.debugfs_dir, kvm, + debugfs_create_file("htab", 0400, kvm->debugfs_dentry, kvm, &debugfs_htab_fops); } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 8cebe5542256..e4ce2a35483f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -1454,7 +1454,7 @@ static const struct file_operations debugfs_radix_fops = { void kvmhv_radix_debugfs_init(struct kvm *kvm) { - debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm, + debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm, &debugfs_radix_fops); } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 84c89f08ae9a..c7e44d75f8aa 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2767,20 +2767,17 @@ static const struct file_operations debugfs_timings_ops = { }; /* Create a debugfs directory for the vcpu */ -static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { - char buf[16]; - struct kvm *kvm = vcpu->kvm; - - snprintf(buf, sizeof(buf), "vcpu%u", id); - vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); - debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, vcpu, + debugfs_create_file("timings", 0444, debugfs_dentry, vcpu, &debugfs_timings_ops); + return 0; } #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ -static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { + return 0; } #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ @@ -2903,8 +2900,6 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); - debugfs_vcpu_init(vcpu, id); - return 0; } @@ -5223,7 +5218,6 @@ void kvmppc_free_host_rm_ops(void) static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; - char buf[32]; int ret; mutex_init(&kvm->arch.uvmem_lock); @@ -5356,15 +5350,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.smt_mode = 1; kvm->arch.emul_smt_mode = 1; - /* - * Create a debugfs directory for the VM - */ - snprintf(buf, sizeof(buf), "vm%d", current->pid); - kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); + return 0; +} + +static int kvmppc_arch_create_vm_debugfs_hv(struct kvm *kvm) +{ kvmppc_mmu_debugfs_init(kvm); if (radix_enabled()) kvmhv_radix_debugfs_init(kvm); - return 0; } @@ -5379,8 +5372,6 @@ static void kvmppc_free_vcores(struct kvm *kvm) static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { - debugfs_remove_recursive(kvm->arch.debugfs_dir); - if (!cpu_has_feature(CPU_FTR_ARCH_300)) kvm_hv_vm_deactivated(); @@ -6042,6 +6033,8 @@ static struct kvmppc_ops kvm_ops_hv = { .svm_off = kvmhv_svm_off, .enable_dawr1 = kvmhv_enable_dawr1, .hash_v3_possible = kvmppc_hash_v3_possible, + .create_vcpu_debugfs = kvmppc_arch_create_vcpu_debugfs_hv, + .create_vm_debugfs = kvmppc_arch_create_vm_debugfs_hv, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 9cc466006e8b..306c85e70eea 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1016,19 +1016,10 @@ DEFINE_SHOW_ATTRIBUTE(xics_debug); static void xics_debugfs_init(struct kvmppc_xics *xics) { - char *name; - - name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics); - if (!name) { - pr_err("%s: no memory for name\n", __func__); - return; - } - - xics->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, + xics->dentry = debugfs_create_file("xics", 0444, xics->kvm->debugfs_dentry, xics, &xics_debug_fops); - pr_debug("%s: created %s\n", __func__, name); - kfree(name); + pr_debug("%s: created\n", __func__); } static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index e216c068075d..37a56cbb1701 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -2354,19 +2354,10 @@ DEFINE_SHOW_ATTRIBUTE(xive_debug); static void xive_debugfs_init(struct kvmppc_xive *xive) { - char *name; - - name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive); - if (!name) { - pr_err("%s: no memory for name\n", __func__); - return; - } - - xive->dentry = debugfs_create_file(name, S_IRUGO, arch_debugfs_dir, + xive->dentry = debugfs_create_file("xive", S_IRUGO, xive->kvm->debugfs_dentry, xive, &xive_debug_fops); - pr_debug("%s: created %s\n", __func__, name); - kfree(name); + pr_debug("%s: created\n", __func__); } static void kvmppc_xive_init(struct kvm_device *dev) diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 561a5bfe0468..3c2b128e5f0f 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1259,19 +1259,10 @@ DEFINE_SHOW_ATTRIBUTE(xive_native_debug); static void xive_native_debugfs_init(struct kvmppc_xive *xive) { - char *name; - - name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive); - if (!name) { - pr_err("%s: no memory for name\n", __func__); - return; - } - - xive->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, + xive->dentry = debugfs_create_file("xive", 0444, xive->kvm->debugfs_dentry, xive, &xive_native_debug_fops); - pr_debug("%s: created %s\n", __func__, name); - kfree(name); + pr_debug("%s: created\n", __func__); } static void kvmppc_xive_native_init(struct kvm_device *dev) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 7e8b69015d20..c8b2b4478545 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -495,6 +495,7 @@ static struct kvmppc_ops kvm_ops_e500 = { .emulate_op = kvmppc_core_emulate_op_e500, .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, + .create_vcpu_debugfs = kvmppc_create_vcpu_debugfs_e500, }; static int __init kvmppc_e500_init(void) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 1c189b5aadcc..fa0d8dbbe484 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -381,6 +381,7 @@ static struct kvmppc_ops kvm_ops_e500mc = { .emulate_op = kvmppc_core_emulate_op_e500, .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, + .create_vcpu_debugfs = kvmppc_create_vcpu_debugfs_e500, }; static int __init kvmppc_e500mc_init(void) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 82d889db2b6b..1d06b68739b0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -777,7 +777,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) rcuwait_init(&vcpu->arch.wait); vcpu->arch.waitp = &vcpu->arch.wait; - kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); return 0; out_vcpu_uninit: @@ -794,8 +793,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) /* Make sure we're not using the vcpu anymore */ hrtimer_cancel(&vcpu->arch.dec_timer); - kvmppc_remove_vcpu_debugfs(vcpu); - switch (vcpu->arch.irq_type) { case KVMPPC_IRQ_MPIC: kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); @@ -2521,3 +2518,16 @@ int kvm_arch_init(void *opaque) } EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr); + +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) +{ + if (vcpu->kvm->arch.kvm_ops->create_vcpu_debugfs) + vcpu->kvm->arch.kvm_ops->create_vcpu_debugfs(vcpu, debugfs_dentry); +} + +int kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + if (kvm->arch.kvm_ops->create_vm_debugfs) + kvm->arch.kvm_ops->create_vm_debugfs(kvm); + return 0; +} diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index ba56a5cbba97..25071331f8c1 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -204,21 +204,10 @@ static const struct file_operations kvmppc_exit_timing_fops = { .release = single_release, }; -void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id) +int kvmppc_create_vcpu_debugfs_e500(struct kvm_vcpu *vcpu, + struct dentry *debugfs_dentry) { - static char dbg_fname[50]; - struct dentry *debugfs_file; - - snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%u_timing", - current->pid, id); - debugfs_file = debugfs_create_file(dbg_fname, 0666, kvm_debugfs_dir, - vcpu, &kvmppc_exit_timing_fops); - - vcpu->arch.debugfs_exit_timing = debugfs_file; -} - -void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - debugfs_remove(vcpu->arch.debugfs_exit_timing); - vcpu->arch.debugfs_exit_timing = NULL; + debugfs_create_file("timing", 0666, debugfs_dentry, + vcpu, &kvmppc_exit_timing_fops); + return 0; } diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index feef7885ba82..45817ab82bb4 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -14,8 +14,8 @@ #ifdef CONFIG_KVM_EXIT_TIMING void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu); void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu); -void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id); -void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu); +int kvmppc_create_vcpu_debugfs_e500(struct kvm_vcpu *vcpu, + struct dentry *debugfs_dentry); static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) { @@ -26,9 +26,11 @@ static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) /* if exit timing is not configured there is no need to build the c file */ static inline void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) {} static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) {} -static inline void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, - unsigned int id) {} -static inline void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) {} +static inline int kvmppc_create_vcpu_debugfs_e500(struct kvm_vcpu *vcpu, + struct dentry *debugfs_dentry) +{ + return 0; +} static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {} #endif /* CONFIG_KVM_EXIT_TIMING */ -- cgit v1.2.3 From a1c414093370ed50e5b952d96d4ae775c7a18420 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sun, 21 Mar 2021 03:09:32 +0530 Subject: powerpc/epapr: Fix parmeters typo s/parmeters/parameters/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210320213932.22697-1-unixbhaskar@gmail.com --- arch/powerpc/include/asm/epapr_hcalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index c99ba08a408d..cdf3c6df5123 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -65,7 +65,7 @@ * but the gcc inline assembly syntax does not allow us to specify registers * on the clobber list that are also on the input/output list. Therefore, * the lists of clobbered registers depends on the number of register - * parmeters ("+r" and "=r") passed to the hypercall. + * parameters ("+r" and "=r") passed to the hypercall. * * Each assembly block should use one of the HCALL_CLOBBERSx macros. As a * general rule, 'x' is the number of parameters passed to the assembly -- cgit v1.2.3 From 535bda36dbf2d271f59e06fe252c32eff452666d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 31 Jan 2022 08:16:48 +0000 Subject: powerpc/nohash: Remove pte_same() arch/powerpc/include/asm/nohash/{32/64}/pgtable.h has #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) ((pte_val(A) ^ pte_val(B)) == 0) include/linux/pgtable.h has #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif Remove the powerpc version which is similar to the generic one. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/83c97bd58a3596ef1b0ff28b1e41fd492d005520.1643616989.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 3 --- arch/powerpc/include/asm/nohash/64/pgtable.h | 3 --- 2 files changed, 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index d959c2a73fbf..a0525765c7bb 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -338,9 +338,6 @@ static inline int pte_young(pte_t pte) return pte_val(pte) & _PAGE_ACCESSED; } -#define __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) ((pte_val(A) ^ pte_val(B)) == 0) - /* * Note that on Book E processors, the pmd contains the kernel virtual * (lowmem) address of the pte page. The physical address is less useful diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 2816d158280a..a441056b3eba 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -281,9 +281,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, flush_tlb_page(vma, address); } -#define __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) ((pte_val(A) ^ pte_val(B)) == 0) - #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ -- cgit v1.2.3 From 4291d085b0b07a78403e845c187428b038c901cd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 30 Jan 2022 10:29:34 +0000 Subject: powerpc/32s: Make pte_update() non atomic on 603 core On 603 core, TLB miss handler don't do any change to the page tables so pte_update() doesn't need to be atomic. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cc89d3c11fc9c742d0df3454a657a3a00be24046.1643538554.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 37 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index f8b94f78403f..772e00dc4ef1 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -298,28 +298,35 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p unsigned long clr, unsigned long set, int huge) { pte_basic_t old; - unsigned long tmp; - __asm__ __volatile__( + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + unsigned long tmp; + + asm volatile( #ifndef CONFIG_PTE_64BIT -"1: lwarx %0, 0, %3\n" -" andc %1, %0, %4\n" + "1: lwarx %0, 0, %3\n" + " andc %1, %0, %4\n" #else -"1: lwarx %L0, 0, %3\n" -" lwz %0, -4(%3)\n" -" andc %1, %L0, %4\n" + "1: lwarx %L0, 0, %3\n" + " lwz %0, -4(%3)\n" + " andc %1, %L0, %4\n" #endif -" or %1, %1, %5\n" -" stwcx. %1, 0, %3\n" -" bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*p) + " or %1, %1, %5\n" + " stwcx. %1, 0, %3\n" + " bne- 1b" + : "=&r" (old), "=&r" (tmp), "=m" (*p) #ifndef CONFIG_PTE_64BIT - : "r" (p), + : "r" (p), #else - : "b" ((unsigned long)(p) + 4), + : "b" ((unsigned long)(p) + 4), #endif - "r" (clr), "r" (set), "m" (*p) - : "cc" ); + "r" (clr), "r" (set), "m" (*p) + : "cc" ); + } else { + old = pte_val(*p); + + *p = __pte((old & ~(pte_basic_t)clr) | set); + } return old; } -- cgit v1.2.3 From d6a6c725a20467f52a41270bdaad9565c66f3b7a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 3 Sep 2021 11:18:38 +0000 Subject: powerpc/machdep: Remove CONFIG_PPC_HAS_FEATURE_CALLS Last user was removed by commit 7bbd827750e6 ("[PATCH] ppc64: very basic desktop g5 sound support"). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/803779fffb4ee0801746b2173d37cea3b273f821.1630667612.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/machdep.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index e821037f74f0..75687e1f994a 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -10,11 +10,6 @@ #include -/* We export this macro for external modules like Alsa to know if - * ppc_md.feature_call is implemented or not - */ -#define CONFIG_PPC_HAS_FEATURE_CALLS - struct pt_regs; struct pci_bus; struct device_node; -- cgit v1.2.3 From e6d03ac156db84422519aa8628efc210d24bf889 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 3 Sep 2021 11:18:40 +0000 Subject: powerpc/machdep: Move sys_ctrler_t definition into pmac_feature.h sys_ctrler_t definitions are tied to pmac. Move it into pmac_feature.h Signed-off-by: Christophe Leroy [mpe: Move to pmac_feature.h to fix some build errors] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7dd5ead4bbca749e2da089ff6fe2b1878d6bf40e.1630667612.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/machdep.h | 15 --------------- arch/powerpc/include/asm/pmac_feature.h | 12 ++++++++++++ arch/powerpc/platforms/powermac/pmac.h | 2 ++ drivers/macintosh/via-cuda.c | 1 + sound/ppc/pmac.h | 1 + 5 files changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 75687e1f994a..06ac7ef07c85 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -230,21 +230,6 @@ extern struct machdep_calls *machine_id; machine_id == &mach_##name; \ }) -#ifdef CONFIG_PPC_PMAC -/* - * Power macintoshes have either a CUDA, PMU or SMU controlling - * system reset, power, NVRAM, RTC. - */ -typedef enum sys_ctrler_kind { - SYS_CTRLER_UNKNOWN = 0, - SYS_CTRLER_CUDA = 1, - SYS_CTRLER_PMU = 2, - SYS_CTRLER_SMU = 3, -} sys_ctrler_t; -extern sys_ctrler_t sys_ctrler; - -#endif /* CONFIG_PPC_PMAC */ - static inline void log_error(char *buf, unsigned int err_type, int fatal) { if (ppc_md.log_error) diff --git a/arch/powerpc/include/asm/pmac_feature.h b/arch/powerpc/include/asm/pmac_feature.h index e08e829261b6..2495866f2e97 100644 --- a/arch/powerpc/include/asm/pmac_feature.h +++ b/arch/powerpc/include/asm/pmac_feature.h @@ -401,5 +401,17 @@ extern u32 __iomem *uninorth_base; */ extern int pmac_get_uninorth_variant(void); +/* + * Power macintoshes have either a CUDA, PMU or SMU controlling + * system reset, power, NVRAM, RTC. + */ +typedef enum sys_ctrler_kind { + SYS_CTRLER_UNKNOWN = 0, + SYS_CTRLER_CUDA = 1, + SYS_CTRLER_PMU = 2, + SYS_CTRLER_SMU = 3, +} sys_ctrler_t; +extern sys_ctrler_t sys_ctrler; + #endif /* __ASM_POWERPC_PMAC_FEATURE_H */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/platforms/powermac/pmac.h b/arch/powerpc/platforms/powermac/pmac.h index 29d2036dcc9d..ba8d4e97095b 100644 --- a/arch/powerpc/platforms/powermac/pmac.h +++ b/arch/powerpc/platforms/powermac/pmac.h @@ -5,6 +5,8 @@ #include #include +#include + /* * Declaration for the various functions exported by the * pmac_* files. Mostly for use by pmac_setup diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c index cd267392289c..3d0d0b9d471d 100644 --- a/drivers/macintosh/via-cuda.c +++ b/drivers/macintosh/via-cuda.c @@ -21,6 +21,7 @@ #ifdef CONFIG_PPC #include #include +#include #else #include #include diff --git a/sound/ppc/pmac.h b/sound/ppc/pmac.h index a758caf689d2..b6f454130463 100644 --- a/sound/ppc/pmac.h +++ b/sound/ppc/pmac.h @@ -26,6 +26,7 @@ #include #include #include +#include /* maximum number of fragments */ #define PMAC_MAX_FRAGS 32 -- cgit v1.2.3 From 12318163737cd8808d13faa6e2393774191a6182 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Nov 2021 13:04:49 +0100 Subject: powerpc/32: Remove remaining .stabs annotations STABS debug format has been superseded long time ago by DWARF. Remove the few remaining .stabs annotations from old 32 bits code. Reported-by: kernel test robot Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/68932ec2ba6b868d35006b96e90f0890f3da3c05.1638273868.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 1 - arch/powerpc/kernel/head_book3s_32.S | 3 --- arch/powerpc/lib/checksum_32.S | 3 --- arch/powerpc/lib/copy_32.S | 3 --- 4 files changed, 10 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index f21e6bde17a1..c4b074263bf9 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -208,7 +208,6 @@ GLUE(.,name): n: #define _GLOBAL(n) \ - .stabs __stringify(n:F-1),N_FUN,0,0,n;\ .globl n; \ n: diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index d489b965f9a6..94f88dc8269d 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -50,9 +50,6 @@ mtspr SPRN_DBAT##n##L,RB __HEAD - .stabs "arch/powerpc/kernel/",N_SO,0,0,0f - .stabs "head_book3s_32.S",N_SO,0,0,0f -0: _ENTRY(_stext); /* diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S index 27d9070617df..4541e8e29467 100644 --- a/arch/powerpc/lib/checksum_32.S +++ b/arch/powerpc/lib/checksum_32.S @@ -116,9 +116,6 @@ EXPORT_SYMBOL(__csum_partial) EX_TABLE(8 ## n ## 7b, fault); .text - .stabs "arch/powerpc/lib/",N_SO,0,0,0f - .stabs "checksum_32.S",N_SO,0,0,0f -0: CACHELINE_BYTES = L1_CACHE_BYTES LG_CACHELINE_BYTES = L1_CACHE_SHIFT diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index a3bcf4786e4a..3e9c27c46331 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -57,9 +57,6 @@ EX_TABLE(8 ## n ## 7b,9 ## n ## 1b) .text - .stabs "arch/powerpc/lib/",N_SO,0,0,0f - .stabs "copy_32.S",N_SO,0,0,0f -0: CACHELINE_BYTES = L1_CACHE_BYTES LG_CACHELINE_BYTES = L1_CACHE_SHIFT -- cgit v1.2.3 From 27e21e8f128a56d3462f0fe2fd3a59c02cc002b1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Nov 2021 13:04:50 +0100 Subject: powerpc/32: Remove _ENTRY() macro _ENTRY() is now redundant with _GLOBAL(). Remove it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/62a35f8dde2bb74c8d0d7a5430cce07a5a3a6fb6.1638273868.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 4 ---- arch/powerpc/kernel/head_40x.S | 18 +++++++++--------- arch/powerpc/kernel/head_44x.S | 4 ++-- arch/powerpc/kernel/head_8xx.S | 4 ++-- arch/powerpc/kernel/head_book3s_32.S | 8 ++++---- arch/powerpc/kernel/head_fsl_booke.S | 6 +++--- 6 files changed, 20 insertions(+), 24 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index c4b074263bf9..3c06a33b5da4 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -203,10 +203,6 @@ GLUE(.,name): #else /* 32-bit */ -#define _ENTRY(n) \ - .globl n; \ -n: - #define _GLOBAL(n) \ .globl n; \ n: diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index b6c6d1de5fd5..088f500896c7 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -53,8 +53,8 @@ * This is all going to change RSN when we add bi_recs....... -- Dan */ __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); mr r31,r3 /* save device tree ptr */ @@ -82,19 +82,19 @@ turn_on_mmu: */ . = 0xc0 crit_save: -_ENTRY(crit_r10) +_GLOBAL(crit_r10) .space 4 -_ENTRY(crit_r11) +_GLOBAL(crit_r11) .space 4 -_ENTRY(crit_srr0) +_GLOBAL(crit_srr0) .space 4 -_ENTRY(crit_srr1) +_GLOBAL(crit_srr1) .space 4 -_ENTRY(crit_r1) +_GLOBAL(crit_r1) .space 4 -_ENTRY(crit_dear) +_GLOBAL(crit_dear) .space 4 -_ENTRY(crit_esr) +_GLOBAL(crit_esr) .space 4 /* diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index b73a56466903..f15cb9fdb692 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -52,8 +52,8 @@ * */ __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); /* * Reserve a word at a fixed location to store the address * of abatron_pteptrs diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 0d073b9fd52c..0b05f2be66b9 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -53,8 +53,8 @@ #define PAGE_SHIFT_8M 23 __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); /* MPC8xx * This port was done on an MBX board with an 860. Right now I only diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 94f88dc8269d..519b60695167 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -50,13 +50,13 @@ mtspr SPRN_DBAT##n##L,RB __HEAD -_ENTRY(_stext); +_GLOBAL(_stext); /* * _start is defined this way because the XCOFF loader in the OpenFirmware * on the powermac expects the entry point to be a procedure descriptor. */ -_ENTRY(_start); +_GLOBAL(_start); /* * These are here for legacy reasons, the kernel used to * need to look like a coff function entry for the pmac @@ -775,7 +775,7 @@ relocate_kernel: * r3 = dest addr, r4 = source addr, r5 = copy limit, r6 = start offset * on exit, r3, r4, r5 are unchanged, r6 is updated to be >= r5. */ -_ENTRY(copy_and_flush) +_GLOBAL(copy_and_flush) addi r5,r5,-4 addi r6,r6,-4 4: li r0,L1_CACHE_BYTES/4 @@ -1073,7 +1073,7 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr -_ENTRY(update_bats) +_GLOBAL(update_bats) lis r4, 1f@h ori r4, r4, 1f@l tophys(r4, r4) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index ac2b4dcf5fd3..f0db4f52bc00 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -54,8 +54,8 @@ * */ __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); /* * Reserve a word at a fixed location to store the address * of abatron_pteptrs @@ -154,7 +154,7 @@ _ENTRY(_start); * if needed */ -_ENTRY(__early_start) +_GLOBAL(__early_start) LOAD_REG_ADDR_PIC(r20, kernstart_virt_addr) lwz r20,0(r20) -- cgit v1.2.3 From a4520b25276500f1abcfc55d24f1251b7b08eff6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:12 +0000 Subject: powerpc/ftrace: Add support for livepatch to PPC32 PPC64 needs some special logic to properly set up the TOC. See commit 85baa095497f ("powerpc/livepatch: Add live patching support on ppc64le") for details. PPC32 doesn't have TOC so it doesn't need that logic, so adding LIVEPATCH support is straight forward. Add CONFIG_LIVEPATCH_64 and move livepatch stack logic into that item. Livepatch sample modules all work. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/63cb094125b6a6038c65eeac2abaabbabe63addd.1640017960.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 6 +++++- arch/powerpc/include/asm/livepatch.h | 8 +++++--- arch/powerpc/include/asm/thread_info.h | 2 +- arch/powerpc/kernel/asm-offsets.c | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b779603978e1..1d027ce94ab1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -9,6 +9,10 @@ config 64BIT bool default y if PPC64 +config LIVEPATCH_64 + def_bool PPC64 + depends on LIVEPATCH + config MMU bool default y @@ -221,7 +225,7 @@ config PPC select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION - select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS && PPC64 + select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_OPTPROBES diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 4fe018cc207b..37af961eb74c 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -23,12 +23,14 @@ static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) static inline unsigned long klp_get_ftrace_location(unsigned long faddr) { /* - * Live patch works only with -mprofile-kernel on PPC. In this case, - * the ftrace location is always within the first 16 bytes. + * Live patch works on PPC32 and only with -mprofile-kernel on PPC64. In + * both cases, the ftrace location is always within the first 16 bytes. */ return ftrace_location_range(faddr, faddr + 16); } +#endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_LIVEPATCH_64 static inline void klp_init_thread_info(struct task_struct *p) { /* + 1 to account for STACK_END_MAGIC */ @@ -36,6 +38,6 @@ static inline void klp_init_thread_info(struct task_struct *p) } #else static inline void klp_init_thread_info(struct task_struct *p) { } -#endif /* CONFIG_LIVEPATCH */ +#endif #endif /* _ASM_POWERPC_LIVEPATCH_H */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index d6e649b3c70b..125328d1b980 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -51,7 +51,7 @@ struct thread_info { unsigned int cpu; #endif unsigned long local_flags; /* private flags for thread */ -#ifdef CONFIG_LIVEPATCH +#ifdef CONFIG_LIVEPATCH_64 unsigned long *livepatch_sp; #endif #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 7582f3e3a330..eec536aef83a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -94,7 +94,7 @@ int main(void) OFFSET(TASK_CPU, task_struct, thread_info.cpu); #endif -#ifdef CONFIG_LIVEPATCH +#ifdef CONFIG_LIVEPATCH_64 OFFSET(TI_livepatch_sp, thread_info, livepatch_sp); #endif -- cgit v1.2.3 From d95bf254be5f74c1e4c8f7cb64e2e21b9cc91717 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:22 +0000 Subject: powerpc/ftrace: Prepare PPC32's ftrace_caller() for CONFIG_DYNAMIC_FTRACE_WITH_ARGS In order to implement CONFIG_DYNAMIC_FTRACE_WITH_ARGS, change ftrace_caller() stack layout to match struct pt_regs. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/da9734eba504998fb914aca12131c9f6bf6120a8.1640017960.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ftrace.h | 39 +---------------------------------- arch/powerpc/kernel/trace/ftrace_32.S | 29 ++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 42 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index debe8c4f7062..b3f6184f77ea 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -10,44 +10,7 @@ #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR -#ifdef __ASSEMBLY__ - -/* Based off of objdump output from glibc */ - -#define MCOUNT_SAVE_FRAME \ - stwu r1,-48(r1); \ - stw r3, 12(r1); \ - stw r4, 16(r1); \ - stw r5, 20(r1); \ - stw r6, 24(r1); \ - mflr r3; \ - lwz r4, 52(r1); \ - mfcr r5; \ - stw r7, 28(r1); \ - stw r8, 32(r1); \ - stw r9, 36(r1); \ - stw r10,40(r1); \ - stw r3, 44(r1); \ - stw r5, 8(r1) - -#define MCOUNT_RESTORE_FRAME \ - lwz r6, 8(r1); \ - lwz r0, 44(r1); \ - lwz r3, 12(r1); \ - mtctr r0; \ - lwz r4, 16(r1); \ - mtcr r6; \ - lwz r5, 20(r1); \ - lwz r6, 24(r1); \ - lwz r0, 52(r1); \ - lwz r7, 28(r1); \ - lwz r8, 32(r1); \ - mtlr r0; \ - lwz r9, 36(r1); \ - lwz r10,40(r1); \ - addi r1, r1, 48 - -#else /* !__ASSEMBLY__ */ +#ifndef __ASSEMBLY__ extern void _mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index 95ffea2bdc29..c4055b41af5f 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -27,17 +27,38 @@ _GLOBAL(_mcount) EXPORT_SYMBOL(_mcount) _GLOBAL(ftrace_caller) - MCOUNT_SAVE_FRAME - /* r3 ends up with link register */ + stwu r1, -INT_FRAME_SIZE(r1) + + SAVE_GPRS(3, 10, r1) + + addi r8, r1, INT_FRAME_SIZE + stw r8, GPR1(r1) + + mflr r3 + stw r3, _NIP(r1) subi r3, r3, MCOUNT_INSN_SIZE + + stw r0, _LINK(r1) + mr r4, r0 + lis r5,function_trace_op@ha lwz r5,function_trace_op@l(r5) - li r6, 0 + + addi r6, r1, STACK_FRAME_OVERHEAD .globl ftrace_call ftrace_call: bl ftrace_stub nop - MCOUNT_RESTORE_FRAME + + lwz r3, _NIP(r1) + mtctr r3 + + REST_GPRS(3, 10, r1) + + lwz r0, _LINK(r1) + mtlr r0 + + addi r1, r1, INT_FRAME_SIZE ftrace_caller_common: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call -- cgit v1.2.3 From 40b035efe288f42bbf4483236cde652584ccb64e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:28 +0000 Subject: powerpc/ftrace: Implement CONFIG_DYNAMIC_FTRACE_WITH_ARGS Implement CONFIG_DYNAMIC_FTRACE_WITH_ARGS. It accelerates the call of livepatching. Also note that powerpc being the last one to convert to CONFIG_DYNAMIC_FTRACE_WITH_ARGS, it will now be possible to remove klp_arch_set_pc() on all architectures. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5831f711a778fcd6eb51eb5898f1faae4378b35b.1640017960.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ftrace.h | 17 +++++++++++++++++ arch/powerpc/include/asm/livepatch.h | 4 +--- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1d027ce94ab1..2a851d31f120 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -201,6 +201,7 @@ config PPC select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_ARGS if MPROFILE_KERNEL || PPC32 select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL || PPC32 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index b3f6184f77ea..45c3d6f11daa 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -22,6 +22,23 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) struct dyn_arch_ftrace { struct module *mod; }; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + return &fregs->regs; +} + +static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs, + unsigned long ip) +{ + regs_set_return_ip(&fregs->regs, ip); +} +#endif #endif /* __ASSEMBLY__ */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 37af961eb74c..6f10de6af6e3 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -14,9 +14,7 @@ #ifdef CONFIG_LIVEPATCH static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - struct pt_regs *regs = ftrace_get_regs(fregs); - - regs_set_return_ip(regs, ip); + ftrace_instruction_pointer_set(fregs, ip); } #define klp_get_ftrace_location klp_get_ftrace_location -- cgit v1.2.3 From 830213786c498b0c488fedd2abc15a7ce442b42f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:35 +0000 Subject: powerpc/ftrace: directly call of function graph tracer by ftrace caller Modify function graph tracer to be handled directly by the standard ftrace caller. This is made possible as powerpc now supports CONFIG_DYNAMIC_FTRACE_WITH_ARGS. This change simplifies the call of function graph ftrace. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/04d196585ff81bde06a000bd9c633a33a5b21130.1640017960.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ftrace.h | 6 +++ arch/powerpc/kernel/trace/ftrace.c | 11 +++++ arch/powerpc/kernel/trace/ftrace_32.S | 53 +-------------------- arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 64 +------------------------- 4 files changed, 20 insertions(+), 114 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 45c3d6f11daa..70b457097098 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -38,6 +38,12 @@ static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *f { regs_set_return_ip(&fregs->regs, ip); } + +struct ftrace_ops; + +#define ftrace_graph_func ftrace_graph_func +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index ce673764cb69..74a176e394ef 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -917,6 +917,9 @@ static int ftrace_modify_ftrace_graph_caller(bool enable) unsigned long stub = (unsigned long)(&ftrace_graph_stub); ppc_inst_t old, new; + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)) + return 0; + old = ftrace_call_replace(ip, enable ? stub : addr, 0); new = ftrace_call_replace(ip, enable ? addr : stub, 0); @@ -955,6 +958,14 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, out: return parent; } + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + fregs->regs.link = prepare_ftrace_return(parent_ip, ip, fregs->regs.gpr[1]); +} +#endif #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef PPC64_ELF_ABI_v1 diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index c4055b41af5f..2b425da97a6b 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -59,13 +59,6 @@ ftrace_call: mtlr r0 addi r1, r1, INT_FRAME_SIZE -ftrace_caller_common: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - b ftrace_graph_stub -_GLOBAL(ftrace_graph_stub) -#endif /* old link register ends up in ctr reg */ bctr @@ -135,52 +128,10 @@ ftrace_regs_call: /* Pop our stack frame */ addi r1, r1, INT_FRAME_SIZE - - b ftrace_caller_common - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -_GLOBAL(ftrace_graph_caller) - stwu r1,-48(r1) - stw r3, 12(r1) - stw r4, 16(r1) - stw r5, 20(r1) - stw r6, 24(r1) - stw r7, 28(r1) - stw r8, 32(r1) - stw r9, 36(r1) - stw r10,40(r1) - - addi r5, r1, 48 - mfctr r4 /* ftrace_caller has moved local addr here */ - stw r4, 44(r1) - mflr r3 /* ftrace_caller has restored LR from stack */ - subi r4, r4, MCOUNT_INSN_SIZE - - bl prepare_ftrace_return - nop - - /* - * prepare_ftrace_return gives us the address we divert to. - * Change the LR in the callers stack frame to this. - */ - stw r3,52(r1) - mtlr r3 - lwz r0,44(r1) - mtctr r0 - - lwz r3, 12(r1) - lwz r4, 16(r1) - lwz r5, 20(r1) - lwz r6, 24(r1) - lwz r7, 28(r1) - lwz r8, 32(r1) - lwz r9, 36(r1) - lwz r10,40(r1) - - addi r1, r1, 48 - + /* old link register ends up in ctr reg */ bctr +#ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(return_to_handler) /* need to save return values */ stwu r1, -16(r1) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index f6f787819273..6071e0122797 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -124,15 +124,6 @@ ftrace_regs_call: /* Based on the cmpd above, if the NIP was altered handle livepatch */ bne- livepatch_handler #endif - -ftrace_caller_common: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - b ftrace_graph_stub -_GLOBAL(ftrace_graph_stub) -#endif - bctr /* jump after _mcount site */ _GLOBAL(ftrace_stub) @@ -216,8 +207,7 @@ ftrace_call: /* Based on the cmpd above, if the NIP was altered handle livepatch */ bne- livepatch_handler #endif - /* Handle function_graph or go back */ - b ftrace_caller_common + bctr /* jump after _mcount site */ #ifdef CONFIG_LIVEPATCH /* @@ -286,55 +276,3 @@ livepatch_handler: /* Return to original caller of live patched function */ blr #endif /* CONFIG_LIVEPATCH */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -_GLOBAL(ftrace_graph_caller) - stdu r1, -112(r1) - /* with -mprofile-kernel, parameter regs are still alive at _mcount */ - std r10, 104(r1) - std r9, 96(r1) - std r8, 88(r1) - std r7, 80(r1) - std r6, 72(r1) - std r5, 64(r1) - std r4, 56(r1) - std r3, 48(r1) - - /* Save callee's TOC in the ABI compliant location */ - std r2, 24(r1) - ld r2, PACATOC(r13) /* get kernel TOC in r2 */ - - addi r5, r1, 112 - mfctr r4 /* ftrace_caller has moved local addr here */ - std r4, 40(r1) - mflr r3 /* ftrace_caller has restored LR from stack */ - subi r4, r4, MCOUNT_INSN_SIZE - - bl prepare_ftrace_return - nop - - /* - * prepare_ftrace_return gives us the address we divert to. - * Change the LR to this. - */ - mtlr r3 - - ld r0, 40(r1) - mtctr r0 - ld r10, 104(r1) - ld r9, 96(r1) - ld r8, 88(r1) - ld r7, 80(r1) - ld r6, 72(r1) - ld r5, 64(r1) - ld r4, 56(r1) - ld r3, 48(r1) - - /* Restore callee's TOC */ - ld r2, 24(r1) - - addi r1, r1, 112 - mflr r0 - std r0, LRSAVE(r1) - bctr -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From 297565aa22cfa80ab0f88c3569693aea0b6afb6d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 5 Feb 2022 16:23:45 +0100 Subject: lib/xor: make xor prototypes more friendly to compiler vectorization Modern compilers are perfectly capable of extracting parallelism from the XOR routines, provided that the prototypes reflect the nature of the input accurately, in particular, the fact that the input vectors are expected not to overlap. This is not documented explicitly, but is implied by the interchangeability of the various C routines, some of which use temporary variables while others don't: this means that these routines only behave identically for non-overlapping inputs. So let's decorate these input vectors with the __restrict modifier, which informs the compiler that there is no overlap. While at it, make the input-only vectors pointer-to-const as well. Tested-by: Nathan Chancellor Signed-off-by: Ard Biesheuvel Reviewed-by: Nick Desaulniers Link: https://github.com/ClangBuiltLinux/linux/issues/563 Signed-off-by: Herbert Xu --- arch/alpha/include/asm/xor.h | 53 ++++++++++++++------- arch/arm/include/asm/xor.h | 42 +++++++++++------ arch/arm64/include/asm/xor.h | 21 ++++++--- arch/arm64/lib/xor-neon.c | 46 ++++++++++++------- arch/ia64/include/asm/xor.h | 21 ++++++--- arch/powerpc/include/asm/xor_altivec.h | 25 +++++----- arch/powerpc/lib/xor_vmx.c | 28 ++++++++---- arch/powerpc/lib/xor_vmx.h | 27 +++++------ arch/powerpc/lib/xor_vmx_glue.c | 32 +++++++------ arch/s390/lib/xor.c | 21 ++++++--- arch/sparc/include/asm/xor_32.h | 21 ++++++--- arch/sparc/include/asm/xor_64.h | 42 +++++++++++------ arch/x86/include/asm/xor.h | 42 +++++++++++------ arch/x86/include/asm/xor_32.h | 42 +++++++++++------ arch/x86/include/asm/xor_avx.h | 21 ++++++--- include/asm-generic/xor.h | 84 ++++++++++++++++++++++------------ include/linux/raid/xor.h | 21 ++++++--- 17 files changed, 381 insertions(+), 208 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h index 5aeb4fb3cb7c..e0de0c233ab9 100644 --- a/arch/alpha/include/asm/xor.h +++ b/arch/alpha/include/asm/xor.h @@ -5,24 +5,43 @@ * Optimized RAID-5 checksumming functions for alpha EV5 and EV6 */ -extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *); -extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *, - unsigned long *); -extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *); -extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *, unsigned long *); +extern void +xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +extern void +xor_alpha_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +extern void +xor_alpha_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +extern void +xor_alpha_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); -extern void xor_alpha_prefetch_2(unsigned long, unsigned long *, - unsigned long *); -extern void xor_alpha_prefetch_3(unsigned long, unsigned long *, - unsigned long *, unsigned long *); -extern void xor_alpha_prefetch_4(unsigned long, unsigned long *, - unsigned long *, unsigned long *, - unsigned long *); -extern void xor_alpha_prefetch_5(unsigned long, unsigned long *, - unsigned long *, unsigned long *, - unsigned long *, unsigned long *); +extern void +xor_alpha_prefetch_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +extern void +xor_alpha_prefetch_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +extern void +xor_alpha_prefetch_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +extern void +xor_alpha_prefetch_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); asm(" \n\ .text \n\ diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h index aefddec79286..669cad5194d3 100644 --- a/arch/arm/include/asm/xor.h +++ b/arch/arm/include/asm/xor.h @@ -44,7 +44,8 @@ : "0" (dst), "r" (a1), "r" (a2), "r" (a3), "r" (a4)) static void -xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_arm4regs_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { unsigned int lines = bytes / sizeof(unsigned long) / 4; register unsigned int a1 __asm__("r4"); @@ -64,8 +65,9 @@ xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_arm4regs_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { unsigned int lines = bytes / sizeof(unsigned long) / 4; register unsigned int a1 __asm__("r4"); @@ -86,8 +88,10 @@ xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_arm4regs_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { unsigned int lines = bytes / sizeof(unsigned long) / 2; register unsigned int a1 __asm__("r8"); @@ -105,8 +109,11 @@ xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_arm4regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { unsigned int lines = bytes / sizeof(unsigned long) / 2; register unsigned int a1 __asm__("r8"); @@ -146,7 +153,8 @@ static struct xor_block_template xor_block_arm4regs = { extern struct xor_block_template const xor_block_neon_inner; static void -xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { if (in_interrupt()) { xor_arm4regs_2(bytes, p1, p2); @@ -158,8 +166,9 @@ xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { if (in_interrupt()) { xor_arm4regs_3(bytes, p1, p2, p3); @@ -171,8 +180,10 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { if (in_interrupt()) { xor_arm4regs_4(bytes, p1, p2, p3, p4); @@ -184,8 +195,11 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { if (in_interrupt()) { xor_arm4regs_5(bytes, p1, p2, p3, p4, p5); diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h index 947f6a4f1aa0..befcd8a7abc9 100644 --- a/arch/arm64/include/asm/xor.h +++ b/arch/arm64/include/asm/xor.h @@ -16,7 +16,8 @@ extern struct xor_block_template const xor_block_inner_neon; static void -xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { kernel_neon_begin(); xor_block_inner_neon.do_2(bytes, p1, p2); @@ -24,8 +25,9 @@ xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { kernel_neon_begin(); xor_block_inner_neon.do_3(bytes, p1, p2, p3); @@ -33,8 +35,10 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { kernel_neon_begin(); xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4); @@ -42,8 +46,11 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { kernel_neon_begin(); xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5); diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c index d189cf4e70ea..96b171995d19 100644 --- a/arch/arm64/lib/xor-neon.c +++ b/arch/arm64/lib/xor-neon.c @@ -10,8 +10,8 @@ #include #include -void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1, - unsigned long *p2) +void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -37,8 +37,9 @@ void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1, } while (--lines > 0); } -void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1, - unsigned long *p2, unsigned long *p3) +void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -72,8 +73,10 @@ void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1, } while (--lines > 0); } -void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1, - unsigned long *p2, unsigned long *p3, unsigned long *p4) +void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -115,9 +118,11 @@ void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1, } while (--lines > 0); } -void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, - unsigned long *p2, unsigned long *p3, - unsigned long *p4, unsigned long *p5) +void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -186,8 +191,10 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) return res; } -static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1, - unsigned long *p2, unsigned long *p3) +static void xor_arm64_eor3_3(unsigned long bytes, + unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -219,9 +226,11 @@ static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1, } while (--lines > 0); } -static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1, - unsigned long *p2, unsigned long *p3, - unsigned long *p4) +static void xor_arm64_eor3_4(unsigned long bytes, + unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -261,9 +270,12 @@ static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1, } while (--lines > 0); } -static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1, - unsigned long *p2, unsigned long *p3, - unsigned long *p4, unsigned long *p5) +static void xor_arm64_eor3_5(unsigned long bytes, + unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; diff --git a/arch/ia64/include/asm/xor.h b/arch/ia64/include/asm/xor.h index 673051bf9d7d..6785f70d3208 100644 --- a/arch/ia64/include/asm/xor.h +++ b/arch/ia64/include/asm/xor.h @@ -4,13 +4,20 @@ */ -extern void xor_ia64_2(unsigned long, unsigned long *, unsigned long *); -extern void xor_ia64_3(unsigned long, unsigned long *, unsigned long *, - unsigned long *); -extern void xor_ia64_4(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *); -extern void xor_ia64_5(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *, unsigned long *); +extern void xor_ia64_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +extern void xor_ia64_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +extern void xor_ia64_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +extern void xor_ia64_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); static struct xor_block_template xor_block_ia64 = { .name = "ia64", diff --git a/arch/powerpc/include/asm/xor_altivec.h b/arch/powerpc/include/asm/xor_altivec.h index 6ca923510b59..294620a25f80 100644 --- a/arch/powerpc/include/asm/xor_altivec.h +++ b/arch/powerpc/include/asm/xor_altivec.h @@ -3,17 +3,20 @@ #define _ASM_POWERPC_XOR_ALTIVEC_H #ifdef CONFIG_ALTIVEC - -void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in); -void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in); -void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in); -void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in, unsigned long *v5_in); +void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); #endif #endif /* _ASM_POWERPC_XOR_ALTIVEC_H */ diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c index 54e61979e80e..aab49d056d18 100644 --- a/arch/powerpc/lib/xor_vmx.c +++ b/arch/powerpc/lib/xor_vmx.c @@ -49,8 +49,9 @@ typedef vector signed char unative_t; V1##_3 = vec_xor(V1##_3, V2##_3); \ } while (0) -void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in) +void __xor_altivec_2(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in) { DEFINE(v1); DEFINE(v2); @@ -67,8 +68,10 @@ void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in, } while (--lines > 0); } -void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in) +void __xor_altivec_3(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in) { DEFINE(v1); DEFINE(v2); @@ -89,9 +92,11 @@ void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in, } while (--lines > 0); } -void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in) +void __xor_altivec_4(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in) { DEFINE(v1); DEFINE(v2); @@ -116,9 +121,12 @@ void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in, } while (--lines > 0); } -void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in, unsigned long *v5_in) +void __xor_altivec_5(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in, + const unsigned long * __restrict v5_in) { DEFINE(v1); DEFINE(v2); diff --git a/arch/powerpc/lib/xor_vmx.h b/arch/powerpc/lib/xor_vmx.h index 5c2b0839b179..573c41d90dac 100644 --- a/arch/powerpc/lib/xor_vmx.h +++ b/arch/powerpc/lib/xor_vmx.h @@ -6,16 +6,17 @@ * outside of the enable/disable altivec block. */ -void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in); - -void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in); - -void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in); - -void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in, unsigned long *v5_in); +void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); diff --git a/arch/powerpc/lib/xor_vmx_glue.c b/arch/powerpc/lib/xor_vmx_glue.c index 80dba916c367..35d917ece4d1 100644 --- a/arch/powerpc/lib/xor_vmx_glue.c +++ b/arch/powerpc/lib/xor_vmx_glue.c @@ -12,47 +12,51 @@ #include #include "xor_vmx.h" -void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in) +void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { preempt_disable(); enable_kernel_altivec(); - __xor_altivec_2(bytes, v1_in, v2_in); + __xor_altivec_2(bytes, p1, p2); disable_kernel_altivec(); preempt_enable(); } EXPORT_SYMBOL(xor_altivec_2); -void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in) +void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { preempt_disable(); enable_kernel_altivec(); - __xor_altivec_3(bytes, v1_in, v2_in, v3_in); + __xor_altivec_3(bytes, p1, p2, p3); disable_kernel_altivec(); preempt_enable(); } EXPORT_SYMBOL(xor_altivec_3); -void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in) +void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { preempt_disable(); enable_kernel_altivec(); - __xor_altivec_4(bytes, v1_in, v2_in, v3_in, v4_in); + __xor_altivec_4(bytes, p1, p2, p3, p4); disable_kernel_altivec(); preempt_enable(); } EXPORT_SYMBOL(xor_altivec_4); -void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in, unsigned long *v5_in) +void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { preempt_disable(); enable_kernel_altivec(); - __xor_altivec_5(bytes, v1_in, v2_in, v3_in, v4_in, v5_in); + __xor_altivec_5(bytes, p1, p2, p3, p4, p5); disable_kernel_altivec(); preempt_enable(); } diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c index a963c3d8ad0d..fb924a8041dc 100644 --- a/arch/s390/lib/xor.c +++ b/arch/s390/lib/xor.c @@ -11,7 +11,8 @@ #include #include -static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { asm volatile( " larl 1,2f\n" @@ -32,8 +33,9 @@ static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) : "0", "1", "cc", "memory"); } -static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { asm volatile( " larl 1,2f\n" @@ -58,8 +60,10 @@ static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "0", "1", "cc", "memory"); } -static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { asm volatile( " larl 1,2f\n" @@ -88,8 +92,11 @@ static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "0", "1", "cc", "memory"); } -static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { asm volatile( " larl 1,2f\n" diff --git a/arch/sparc/include/asm/xor_32.h b/arch/sparc/include/asm/xor_32.h index 3e5af37e4b9c..0351813cf3af 100644 --- a/arch/sparc/include/asm/xor_32.h +++ b/arch/sparc/include/asm/xor_32.h @@ -13,7 +13,8 @@ */ static void -sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +sparc_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { int lines = bytes / (sizeof (long)) / 8; @@ -50,8 +51,9 @@ sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +sparc_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { int lines = bytes / (sizeof (long)) / 8; @@ -101,8 +103,10 @@ sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +sparc_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { int lines = bytes / (sizeof (long)) / 8; @@ -165,8 +169,11 @@ sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -sparc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +sparc_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { int lines = bytes / (sizeof (long)) / 8; diff --git a/arch/sparc/include/asm/xor_64.h b/arch/sparc/include/asm/xor_64.h index 16169f3edcd5..caaddea8ad79 100644 --- a/arch/sparc/include/asm/xor_64.h +++ b/arch/sparc/include/asm/xor_64.h @@ -12,13 +12,20 @@ #include -void xor_vis_2(unsigned long, unsigned long *, unsigned long *); -void xor_vis_3(unsigned long, unsigned long *, unsigned long *, - unsigned long *); -void xor_vis_4(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *); -void xor_vis_5(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *, unsigned long *); +void xor_vis_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void xor_vis_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +void xor_vis_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); /* XXX Ugh, write cheetah versions... -DaveM */ @@ -30,13 +37,20 @@ static struct xor_block_template xor_block_VIS = { .do_5 = xor_vis_5, }; -void xor_niagara_2(unsigned long, unsigned long *, unsigned long *); -void xor_niagara_3(unsigned long, unsigned long *, unsigned long *, - unsigned long *); -void xor_niagara_4(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *); -void xor_niagara_5(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *, unsigned long *); +void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void xor_niagara_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +void xor_niagara_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); static struct xor_block_template xor_block_niagara = { .name = "Niagara", diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 2ee95a7769e6..7b0307acc410 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -57,7 +57,8 @@ op(i + 3, 3) static void -xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { unsigned long lines = bytes >> 8; @@ -108,7 +109,8 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { unsigned long lines = bytes >> 8; @@ -142,8 +144,9 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { unsigned long lines = bytes >> 8; @@ -201,8 +204,9 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { unsigned long lines = bytes >> 8; @@ -238,8 +242,10 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { unsigned long lines = bytes >> 8; @@ -304,8 +310,10 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { unsigned long lines = bytes >> 8; @@ -343,8 +351,11 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { unsigned long lines = bytes >> 8; @@ -416,8 +427,11 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { unsigned long lines = bytes >> 8; diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 67ceb790e639..7a6b9474591e 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -21,7 +21,8 @@ #include static void -xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { unsigned long lines = bytes >> 7; @@ -64,8 +65,9 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { unsigned long lines = bytes >> 7; @@ -113,8 +115,10 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { unsigned long lines = bytes >> 7; @@ -168,8 +172,11 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, static void -xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { unsigned long lines = bytes >> 7; @@ -248,7 +255,8 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, #undef BLOCK static void -xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { unsigned long lines = bytes >> 6; @@ -295,8 +303,9 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { unsigned long lines = bytes >> 6; @@ -352,8 +361,10 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { unsigned long lines = bytes >> 6; @@ -418,8 +429,11 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { unsigned long lines = bytes >> 6; diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 0c4e5b5e3852..7f81dd5897f4 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -26,7 +26,8 @@ BLOCK4(8) \ BLOCK4(12) -static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) +static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0, + const unsigned long * __restrict p1) { unsigned long lines = bytes >> 9; @@ -52,8 +53,9 @@ do { \ kernel_fpu_end(); } -static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, - unsigned long *p2) +static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0, + const unsigned long * __restrict p1, + const unsigned long * __restrict p2) { unsigned long lines = bytes >> 9; @@ -82,8 +84,10 @@ do { \ kernel_fpu_end(); } -static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, - unsigned long *p2, unsigned long *p3) +static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0, + const unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { unsigned long lines = bytes >> 9; @@ -115,8 +119,11 @@ do { \ kernel_fpu_end(); } -static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, - unsigned long *p2, unsigned long *p3, unsigned long *p4) +static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0, + const unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { unsigned long lines = bytes >> 9; diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h index b62a2a56a4d4..44509d48fca2 100644 --- a/include/asm-generic/xor.h +++ b/include/asm-generic/xor.h @@ -8,7 +8,8 @@ #include static void -xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { long lines = bytes / (sizeof (long)) / 8; @@ -27,8 +28,9 @@ xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { long lines = bytes / (sizeof (long)) / 8; @@ -48,8 +50,10 @@ xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { long lines = bytes / (sizeof (long)) / 8; @@ -70,8 +74,11 @@ xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { long lines = bytes / (sizeof (long)) / 8; @@ -93,7 +100,8 @@ xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { long lines = bytes / (sizeof (long)) / 8; @@ -129,8 +137,9 @@ xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { long lines = bytes / (sizeof (long)) / 8; @@ -175,8 +184,10 @@ xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { long lines = bytes / (sizeof (long)) / 8; @@ -230,8 +241,11 @@ xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { long lines = bytes / (sizeof (long)) / 8; @@ -294,7 +308,8 @@ xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_8regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { long lines = bytes / (sizeof (long)) / 8 - 1; prefetchw(p1); @@ -320,8 +335,9 @@ xor_8regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_8regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { long lines = bytes / (sizeof (long)) / 8 - 1; prefetchw(p1); @@ -350,8 +366,10 @@ xor_8regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_8regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { long lines = bytes / (sizeof (long)) / 8 - 1; @@ -384,8 +402,11 @@ xor_8regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_8regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { long lines = bytes / (sizeof (long)) / 8 - 1; @@ -421,7 +442,8 @@ xor_8regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_32regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { long lines = bytes / (sizeof (long)) / 8 - 1; @@ -466,8 +488,9 @@ xor_32regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) } static void -xor_32regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { long lines = bytes / (sizeof (long)) / 8 - 1; @@ -523,8 +546,10 @@ xor_32regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_32regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { long lines = bytes / (sizeof (long)) / 8 - 1; @@ -591,8 +616,11 @@ xor_32regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, } static void -xor_32regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { long lines = bytes / (sizeof (long)) / 8 - 1; diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 2a9fee8ddae3..51b811b62322 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -11,13 +11,20 @@ struct xor_block_template { struct xor_block_template *next; const char *name; int speed; - void (*do_2)(unsigned long, unsigned long *, unsigned long *); - void (*do_3)(unsigned long, unsigned long *, unsigned long *, - unsigned long *); - void (*do_4)(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *); - void (*do_5)(unsigned long, unsigned long *, unsigned long *, - unsigned long *, unsigned long *, unsigned long *); + void (*do_2)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_3)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_4)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_5)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); }; #endif -- cgit v1.2.3 From f222ab83df92acf72691a2021e1f0d99880dcdf1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 24 Dec 2021 11:07:40 +0000 Subject: powerpc: Add set_memory_{p/np}() and remove set_memory_attr() set_memory_attr() was implemented by commit 4d1755b6a762 ("powerpc/mm: implement set_memory_attr()") because the set_memory_xx() couldn't be used at that time to modify memory "on the fly" as explained it the commit. But set_memory_attr() uses set_pte_at() which leads to warnings when CONFIG_DEBUG_VM is selected, because set_pte_at() is unexpected for updating existing page table entries. The check could be bypassed by using __set_pte_at() instead, as it was the case before commit c988cfd38e48 ("powerpc/32: use set_memory_attr()") but since commit 9f7853d7609d ("powerpc/mm: Fix set_memory_*() against concurrent accesses") it is now possible to use set_memory_xx() functions to update page table entries "on the fly" because the update is now atomic. For DEBUG_PAGEALLOC we need to clear and set back _PAGE_PRESENT. Add set_memory_np() and set_memory_p() for that. Replace all uses of set_memory_attr() by the relevant set_memory_xx() and remove set_memory_attr(). Fixes: c988cfd38e48 ("powerpc/32: use set_memory_attr()") Cc: stable@vger.kernel.org Reported-by: Maxime Bizon Signed-off-by: Christophe Leroy Tested-by: Maxime Bizon Reviewed-by: Russell Currey Depends-on: 9f7853d7609d ("powerpc/mm: Fix set_memory_*() against concurrent accesses") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cda2b44b55c96f9ac69fa92e68c01084ec9495c5.1640344012.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/set_memory.h | 12 ++++++++++- arch/powerpc/mm/pageattr.c | 39 ++++++----------------------------- arch/powerpc/mm/pgtable_32.c | 24 ++++++++++----------- 3 files changed, 28 insertions(+), 47 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/set_memory.h b/arch/powerpc/include/asm/set_memory.h index b040094f7920..7ebc807aa8cc 100644 --- a/arch/powerpc/include/asm/set_memory.h +++ b/arch/powerpc/include/asm/set_memory.h @@ -6,6 +6,8 @@ #define SET_MEMORY_RW 1 #define SET_MEMORY_NX 2 #define SET_MEMORY_X 3 +#define SET_MEMORY_NP 4 /* Set memory non present */ +#define SET_MEMORY_P 5 /* Set memory present */ int change_memory_attr(unsigned long addr, int numpages, long action); @@ -29,6 +31,14 @@ static inline int set_memory_x(unsigned long addr, int numpages) return change_memory_attr(addr, numpages, SET_MEMORY_X); } -int set_memory_attr(unsigned long addr, int numpages, pgprot_t prot); +static inline int set_memory_np(unsigned long addr, int numpages) +{ + return change_memory_attr(addr, numpages, SET_MEMORY_NP); +} + +static inline int set_memory_p(unsigned long addr, int numpages) +{ + return change_memory_attr(addr, numpages, SET_MEMORY_P); +} #endif diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c index 8812454e70ff..85753e32a4de 100644 --- a/arch/powerpc/mm/pageattr.c +++ b/arch/powerpc/mm/pageattr.c @@ -46,6 +46,12 @@ static int change_page_attr(pte_t *ptep, unsigned long addr, void *data) case SET_MEMORY_X: pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_ROX); break; + case SET_MEMORY_NP: + pte_update(&init_mm, addr, ptep, _PAGE_PRESENT, 0, 0); + break; + case SET_MEMORY_P: + pte_update(&init_mm, addr, ptep, 0, _PAGE_PRESENT, 0); + break; default: WARN_ON_ONCE(1); break; @@ -90,36 +96,3 @@ int change_memory_attr(unsigned long addr, int numpages, long action) return apply_to_existing_page_range(&init_mm, start, size, change_page_attr, (void *)action); } - -/* - * Set the attributes of a page: - * - * This function is used by PPC32 at the end of init to set final kernel memory - * protection. It includes changing the maping of the page it is executing from - * and data pages it is using. - */ -static int set_page_attr(pte_t *ptep, unsigned long addr, void *data) -{ - pgprot_t prot = __pgprot((unsigned long)data); - - spin_lock(&init_mm.page_table_lock); - - set_pte_at(&init_mm, addr, ptep, pte_modify(*ptep, prot)); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - - spin_unlock(&init_mm.page_table_lock); - - return 0; -} - -int set_memory_attr(unsigned long addr, int numpages, pgprot_t prot) -{ - unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); - unsigned long sz = numpages * PAGE_SIZE; - - if (numpages <= 0) - return 0; - - return apply_to_existing_page_range(&init_mm, start, sz, set_page_attr, - (void *)pgprot_val(prot)); -} diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 906e4e4328b2..f71ededdc02a 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -135,10 +135,12 @@ void mark_initmem_nx(void) unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); - if (v_block_mapped((unsigned long)_sinittext)) + if (v_block_mapped((unsigned long)_sinittext)) { mmu_mark_initmem_nx(); - else - set_memory_attr((unsigned long)_sinittext, numpages, PAGE_KERNEL); + } else { + set_memory_nx((unsigned long)_sinittext, numpages); + set_memory_rw((unsigned long)_sinittext, numpages); + } } #ifdef CONFIG_STRICT_KERNEL_RWX @@ -152,18 +154,14 @@ void mark_rodata_ro(void) return; } - numpages = PFN_UP((unsigned long)_etext) - - PFN_DOWN((unsigned long)_stext); - - set_memory_attr((unsigned long)_stext, numpages, PAGE_KERNEL_ROX); /* - * mark .rodata as read only. Use __init_begin rather than __end_rodata - * to cover NOTES and EXCEPTION_TABLE. + * mark .text and .rodata as read only. Use __init_begin rather than + * __end_rodata to cover NOTES and EXCEPTION_TABLE. */ numpages = PFN_UP((unsigned long)__init_begin) - - PFN_DOWN((unsigned long)__start_rodata); + PFN_DOWN((unsigned long)_stext); - set_memory_attr((unsigned long)__start_rodata, numpages, PAGE_KERNEL_RO); + set_memory_ro((unsigned long)_stext, numpages); // mark_initmem_nx() should have already run by now ptdump_check_wx(); @@ -179,8 +177,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) return; if (enable) - set_memory_attr(addr, numpages, PAGE_KERNEL); + set_memory_p(addr, numpages); else - set_memory_attr(addr, numpages, __pgprot(0)); + set_memory_np(addr, numpages); } #endif /* CONFIG_DEBUG_PAGEALLOC */ -- cgit v1.2.3 From f061fb03ee611c5657010ee4fa2a3fa64dfe3bd0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 21 Jan 2022 16:30:21 +0000 Subject: powerpc/vdso: augment VDSO32 functions to support 64 bits build VDSO64 cacheflush.S datapage.S gettimeofday.S and vgettimeofday.c are very similar to their VDSO32 counterpart. VDSO32 counterpart is already more complete than the VDSO64 version as it supports both PPC32 vdso and 32 bits VDSO for PPC64. Use compat macros wherever necessary in PPC32 files so that they can also be used to build VDSO64. vdso64/note.S is already a link to vdso32/note.S so no change is required. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c2cbb8f046b7efc251053521dc39b752795e26b7.1642782130.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-compat.h | 2 ++ arch/powerpc/kernel/vdso32/cacheflush.S | 4 ++-- arch/powerpc/kernel/vdso32/datapage.S | 10 ++++++++-- arch/powerpc/kernel/vdso32/getcpu.S | 4 ++-- arch/powerpc/kernel/vdso32/gettimeofday.S | 8 ++++++-- arch/powerpc/kernel/vdso32/vgettimeofday.c | 23 +++++++++++++++++++---- 6 files changed, 39 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 2b736d9fbb1b..2bc53c646ccd 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -21,6 +21,7 @@ #define PPC_STLCX stringify_in_c(stdcx.) #define PPC_CNTLZL stringify_in_c(cntlzd) #define PPC_MTOCRF(FXM, RS) MTOCRF((FXM), RS) +#define PPC_SRL stringify_in_c(srd) #define PPC_LR_STKOFF 16 #define PPC_MIN_STKFRM 112 @@ -54,6 +55,7 @@ #define PPC_STLCX stringify_in_c(stwcx.) #define PPC_CNTLZL stringify_in_c(cntlzw) #define PPC_MTOCRF stringify_in_c(mtcrf) +#define PPC_SRL stringify_in_c(srw) #define PPC_LR_STKOFF 4 #define PPC_MIN_STKFRM 16 diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S index f340e82d1981..d4e43ab2d5df 100644 --- a/arch/powerpc/kernel/vdso32/cacheflush.S +++ b/arch/powerpc/kernel/vdso32/cacheflush.S @@ -46,7 +46,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) add r8,r8,r5 /* ensure we get enough */ #ifdef CONFIG_PPC64 lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) - srw. r8,r8,r9 /* compute line count */ + PPC_SRL. r8,r8,r9 /* compute line count */ #else srwi. r8, r8, L1_CACHE_SHIFT mr r7, r6 @@ -72,7 +72,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) subf r8,r6,r4 /* compute length */ add r8,r8,r5 lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) - srw. r8,r8,r9 /* compute line count */ + PPC_SRL. r8,r8,r9 /* compute line count */ crclr cr0*4+so beqlr /* nothing to do? */ #endif diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 65244416ab94..db8e167f0166 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -30,11 +30,15 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) mr. r4,r3 get_datapage r3 mtlr r12 +#ifdef __powerpc64__ + addi r3,r3,CFG_SYSCALL_MAP64 +#else addi r3,r3,CFG_SYSCALL_MAP32 +#endif + crclr cr0*4+so beqlr li r0,NR_syscalls stw r0,0(r4) - crclr cr0*4+so blr .cfi_endproc V_FUNCTION_END(__kernel_get_syscall_map) @@ -49,8 +53,10 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) mflr r12 .cfi_register lr,r12 get_datapage r3 +#ifndef __powerpc64__ lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) - lwz r3,CFG_TB_TICKS_PER_SEC(r3) +#endif + PPC_LL r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 crclr cr0*4+so blr diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S index ff5e214fec41..8e08ccf19062 100644 --- a/arch/powerpc/kernel/vdso32/getcpu.S +++ b/arch/powerpc/kernel/vdso32/getcpu.S @@ -19,8 +19,8 @@ V_FUNCTION_BEGIN(__kernel_getcpu) .cfi_startproc mfspr r5,SPRN_SPRG_VDSO_READ - cmpwi cr0,r3,0 - cmpwi cr1,r4,0 + PPC_LCMPI cr0,r3,0 + PPC_LCMPI cr1,r4,0 clrlwi r6,r5,16 rlwinm r7,r5,16,31-15,31-0 beq cr0,1f diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index d21d08140a5e..c875312274aa 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Userland implementation of gettimeofday() for 32 bits processes in a - * ppc64 kernel for use in the vDSO + * Userland implementation of gettimeofday() for processes + * for use in the vDSO * * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org, * IBM Corp. @@ -41,9 +41,11 @@ V_FUNCTION_END(__kernel_clock_gettime) * int __kernel_clock_gettime64(clockid_t clock_id, struct __timespec64 *ts); * */ +#ifndef __powerpc64__ V_FUNCTION_BEGIN(__kernel_clock_gettime64) cvdso_call __c_kernel_clock_gettime64 V_FUNCTION_END(__kernel_clock_gettime64) +#endif /* * Exact prototype of clock_getres() @@ -69,6 +71,7 @@ V_FUNCTION_END(__kernel_time) /* Routines for restoring integer registers, called by the compiler. */ /* Called with r11 pointing to the stack header word of the caller of the */ /* function, just beyond the end of the integer restore area. */ +#ifndef __powerpc64__ _GLOBAL(_restgpr_31_x) _GLOBAL(_rest32gpr_31_x) lwz r0,4(r11) @@ -76,3 +79,4 @@ _GLOBAL(_rest32gpr_31_x) mtlr r0 mr r1,r11 blr +#endif diff --git a/arch/powerpc/kernel/vdso32/vgettimeofday.c b/arch/powerpc/kernel/vdso32/vgettimeofday.c index 65fb03fb1731..55a287c9a736 100644 --- a/arch/powerpc/kernel/vdso32/vgettimeofday.c +++ b/arch/powerpc/kernel/vdso32/vgettimeofday.c @@ -2,8 +2,22 @@ /* * Powerpc userspace implementations of gettimeofday() and similar. */ +#include #include +#ifdef __powerpc64__ +int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts, + const struct vdso_data *vd) +{ + return __cvdso_clock_gettime_data(vd, clock, ts); +} + +int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res, + const struct vdso_data *vd) +{ + return __cvdso_clock_getres_data(vd, clock_id, res); +} +#else int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts, const struct vdso_data *vd) { @@ -16,16 +30,17 @@ int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts, return __cvdso_clock_gettime_data(vd, clock, ts); } -int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, +int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, const struct vdso_data *vd) { - return __cvdso_gettimeofday_data(vd, tv, tz); + return __cvdso_clock_getres_time32_data(vd, clock_id, res); } +#endif -int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, +int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, const struct vdso_data *vd) { - return __cvdso_clock_getres_time32_data(vd, clock_id, res); + return __cvdso_gettimeofday_data(vd, tv, tz); } __kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd) -- cgit v1.2.3 From 9b97bea90072a075363a200dd7b54ad4a24e9491 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 21 Jan 2022 16:30:30 +0000 Subject: powerpc/vdso: Remove cvdso_call_time macro cvdso_call_time macro is very similar to cvdso_call macro. Add a call_time argument to cvdso_call which is 0 by default and set to 1 when using cvdso_call to call __c_kernel_time(). Return returned value as is with CR[SO] cleared when it is used for time(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/837a260ad86fc1ce297a562c2117fd69be5f7b5c.1642782130.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/vdso/gettimeofday.h | 37 ++++++++-------------------- arch/powerpc/kernel/vdso/gettimeofday.S | 2 +- 2 files changed, 11 insertions(+), 28 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index 1faff0be1111..df00e91c9a90 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -9,12 +9,12 @@ #include /* - * The macros sets two stack frames, one for the caller and one for the callee + * The macro sets two stack frames, one for the caller and one for the callee * because there are no requirement for the caller to set a stack frame when * calling VDSO so it may have omitted to set one, especially on PPC64 */ -.macro cvdso_call funct +.macro cvdso_call funct call_time=0 .cfi_startproc PPC_STLU r1, -PPC_MIN_STKFRM(r1) mflr r0 @@ -25,45 +25,28 @@ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) #endif get_datapage r5 + .ifeq \call_time addi r5, r5, VDSO_DATA_OFFSET + .else + addi r4, r5, VDSO_DATA_OFFSET + .endif bl DOTSYM(\funct) PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) #ifdef __powerpc64__ PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) #endif + .ifeq \call_time cmpwi r3, 0 + .endif mtlr r0 .cfi_restore lr addi r1, r1, 2 * PPC_MIN_STKFRM crclr so + .ifeq \call_time beqlr+ crset so neg r3, r3 - blr - .cfi_endproc -.endm - -.macro cvdso_call_time funct - .cfi_startproc - PPC_STLU r1, -PPC_MIN_STKFRM(r1) - mflr r0 - .cfi_register lr, r0 - PPC_STLU r1, -PPC_MIN_STKFRM(r1) - PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) -#ifdef __powerpc64__ - PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) -#endif - get_datapage r4 - addi r4, r4, VDSO_DATA_OFFSET - bl DOTSYM(\funct) - PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) -#ifdef __powerpc64__ - PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) -#endif - crclr so - mtlr r0 - .cfi_restore lr - addi r1, r1, 2 * PPC_MIN_STKFRM + .endif blr .cfi_endproc .endm diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index c875312274aa..397f290015bc 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -65,7 +65,7 @@ V_FUNCTION_END(__kernel_clock_getres) * */ V_FUNCTION_BEGIN(__kernel_time) - cvdso_call_time __c_kernel_time + cvdso_call __c_kernel_time call_time=1 V_FUNCTION_END(__kernel_time) /* Routines for restoring integer registers, called by the compiler. */ -- cgit v1.2.3 From 692b21d78046851e75dc25bba773189c670b49c2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 21 Jan 2022 16:30:34 +0000 Subject: powerpc/vdso: Move cvdso_call macro into gettimeofday.S Now that gettimeofday.S is unique, move cvdso_call macro into that file which is the only user. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/72720359d4c58e3a3b96dd74952741225faac3de.1642782130.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/vdso/gettimeofday.h | 52 ++-------------------------- arch/powerpc/kernel/vdso/gettimeofday.S | 44 ++++++++++++++++++++++- 2 files changed, 45 insertions(+), 51 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index df00e91c9a90..f0a4cf01e85c 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -2,57 +2,9 @@ #ifndef _ASM_POWERPC_VDSO_GETTIMEOFDAY_H #define _ASM_POWERPC_VDSO_GETTIMEOFDAY_H -#include - -#ifdef __ASSEMBLY__ - -#include - -/* - * The macro sets two stack frames, one for the caller and one for the callee - * because there are no requirement for the caller to set a stack frame when - * calling VDSO so it may have omitted to set one, especially on PPC64 - */ - -.macro cvdso_call funct call_time=0 - .cfi_startproc - PPC_STLU r1, -PPC_MIN_STKFRM(r1) - mflr r0 - .cfi_register lr, r0 - PPC_STLU r1, -PPC_MIN_STKFRM(r1) - PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) -#ifdef __powerpc64__ - PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) -#endif - get_datapage r5 - .ifeq \call_time - addi r5, r5, VDSO_DATA_OFFSET - .else - addi r4, r5, VDSO_DATA_OFFSET - .endif - bl DOTSYM(\funct) - PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) -#ifdef __powerpc64__ - PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) -#endif - .ifeq \call_time - cmpwi r3, 0 - .endif - mtlr r0 - .cfi_restore lr - addi r1, r1, 2 * PPC_MIN_STKFRM - crclr so - .ifeq \call_time - beqlr+ - crset so - neg r3, r3 - .endif - blr - .cfi_endproc -.endm - -#else +#ifndef __ASSEMBLY__ +#include #include #include #include diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index 397f290015bc..eb9c81e1c218 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -12,7 +12,49 @@ #include #include #include -#include + +/* + * The macro sets two stack frames, one for the caller and one for the callee + * because there are no requirement for the caller to set a stack frame when + * calling VDSO so it may have omitted to set one, especially on PPC64 + */ + +.macro cvdso_call funct call_time=0 + .cfi_startproc + PPC_STLU r1, -PPC_MIN_STKFRM(r1) + mflr r0 + .cfi_register lr, r0 + PPC_STLU r1, -PPC_MIN_STKFRM(r1) + PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) +#ifdef __powerpc64__ + PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) +#endif + get_datapage r5 + .ifeq \call_time + addi r5, r5, VDSO_DATA_OFFSET + .else + addi r4, r5, VDSO_DATA_OFFSET + .endif + bl DOTSYM(\funct) + PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) +#ifdef __powerpc64__ + PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) +#endif + .ifeq \call_time + cmpwi r3, 0 + .endif + mtlr r0 + .cfi_restore lr + addi r1, r1, 2 * PPC_MIN_STKFRM + crclr so + .ifeq \call_time + beqlr+ + crset so + neg r3, r3 + .endif + blr + .cfi_endproc +.endm .text /* -- cgit v1.2.3 From 92e6dc257bd5771cf8db662e4d371fdb58fcbf43 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 7 Feb 2022 16:12:47 -0600 Subject: powerpc/pseries: make pseries_devicetree_update() static pseries_devicetree_update() has only one call site, in the same file in which it is defined. Make it static. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220207221247.354454-1-nathanl@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 1 - arch/powerpc/platforms/pseries/mobility.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 82e5b055fa2a..00531af17ce0 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -274,7 +274,6 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES extern time64_t last_rtas_event; extern int clobbering_unread_rtas_event(void); -extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); int rtas_syscall_dispatch_ibm_suspend_me(u64 handle); #else diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 85033f392c78..94077fa91959 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -265,7 +265,7 @@ static int add_dt_node(struct device_node *parent_dn, __be32 drc_index) return rc; } -int pseries_devicetree_update(s32 scope) +static int pseries_devicetree_update(s32 scope) { char *rtas_buf; __be32 *data; -- cgit v1.2.3 From 2354ad252b66695be02f4acd18e37bf6264f0464 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Feb 2022 12:22:15 +0530 Subject: powerpc/mm: Update default hugetlb size early commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by mobility") introduced pageblock_order which will be used to group pages better. The kernel now groups pages based on the value of HPAGE_SHIFT. Hence HPAGE_SHIFT should be set before we call set_pageblock_order. set_pageblock_order happens early in the boot and default hugetlb page size should be initialized before that to compute the right pageblock_order value. Currently, default hugetlbe page size is set via arch_initcalls which happens late in the boot as shown via the below callstack: [c000000007383b10] [c000000001289328] hugetlbpage_init+0x2b8/0x2f8 [c000000007383bc0] [c0000000012749e4] do_one_initcall+0x14c/0x320 [c000000007383c90] [c00000000127505c] kernel_init_freeable+0x410/0x4e8 [c000000007383da0] [c000000000012664] kernel_init+0x30/0x15c [c000000007383e10] [c00000000000cf14] ret_from_kernel_thread+0x5c/0x64 and the pageblock_order initialization is done early during the boot. [c0000000018bfc80] [c0000000012ae120] set_pageblock_order+0x50/0x64 [c0000000018bfca0] [c0000000012b3d94] sparse_init+0x188/0x268 [c0000000018bfd60] [c000000001288bfc] initmem_init+0x28c/0x328 [c0000000018bfe50] [c00000000127b370] setup_arch+0x410/0x480 [c0000000018bfed0] [c00000000127401c] start_kernel+0xb8/0x934 [c0000000018bff90] [c00000000000d984] start_here_common+0x1c/0x98 delaying default hugetlb page size initialization implies the kernel will initialize pageblock_order to (MAX_ORDER - 1) which is not an optimal value for mobility grouping. IIUC we always had this issue. But it was not a problem for hash translation mode because (MAX_ORDER - 1) is the same as HUGETLB_PAGE_ORDER (8) in the case of hash (16MB). With radix, HUGETLB_PAGE_ORDER will be 5 (2M size) and hence pageblock_order should be 5 instead of 8. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220211065215.101767-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/hugetlb.h | 5 ++++- arch/powerpc/mm/book3s64/hugetlbpage.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 5 +---- arch/powerpc/mm/init_64.c | 4 ++++ 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 962708fa1017..6a1a1ac5743b 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -15,7 +15,7 @@ extern bool hugetlb_disabled; -void __init hugetlbpage_init_default(void); +void __init hugetlbpage_init_defaultsize(void); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len); @@ -76,6 +76,9 @@ static inline void __init gigantic_hugetlb_cma_reserve(void) { } +static inline void __init hugetlbpage_init_defaultsize(void) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _ASM_POWERPC_HUGETLB_H */ diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index ea8f83afb0ae..3bc0eb21b2a0 100644 --- a/arch/powerpc/mm/book3s64/hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -150,7 +150,7 @@ void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } -void __init hugetlbpage_init_default(void) +void __init hugetlbpage_init_defaultsize(void) { /* Set default large page size. Currently, we pick 16M or 1M * depending on what is available diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index ddead41e2194..b642a5a8668f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -664,10 +664,7 @@ static int __init hugetlbpage_init(void) configured = true; } - if (configured) { - if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) - hugetlbpage_init_default(); - } else + if (!configured) pr_info("Failed to initialize. Disabling HugeTLB"); return 0; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 35f46bf54281..83c0ee9fbf05 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -59,6 +59,7 @@ #include #include #include +#include #include @@ -513,6 +514,9 @@ void __init mmu_early_init_devtree(void) } else hash__early_init_devtree(); + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) + hugetlbpage_init_defaultsize(); + if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) && !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX)) panic("kernel does not support any MMU type offered by platform"); -- cgit v1.2.3 From 5a72345e6a78120368fcc841b570331b6c5a50da Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 11 Feb 2022 17:32:37 +1100 Subject: powerpc: Fix STACKTRACE=n build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our skiroot_defconfig doesn't enable FTRACE, and so doesn't get STACKTRACE enabled either. That leads to a build failure since commit 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") made stacktrace.c build even when STACKTRACE=n. arch/powerpc/kernel/stacktrace.c: In function ‘handle_backtrace_ipi’: arch/powerpc/kernel/stacktrace.c:171:2: error: implicit declaration of function ‘nmi_cpu_backtrace’ 171 | nmi_cpu_backtrace(regs); | ^~~~~~~~~~~~~~~~~ arch/powerpc/kernel/stacktrace.c: In function ‘arch_trigger_cpumask_backtrace’: arch/powerpc/kernel/stacktrace.c:226:2: error: implicit declaration of function ‘nmi_trigger_cpumask_backtrace’ 226 | nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This happens because our headers haven't defined arch_trigger_cpumask_backtrace, which causes lib/nmi_backtrace.c not to build nmi_cpu_backtrace(). The code in question doesn't actually depend on STACKTRACE=y, that was just added because arch_trigger_cpumask_backtrace() lived in stacktrace.c for convenience. So drop the dependency on CONFIG_STACKTRACE, that causes lib/nmi_backtrace.c to build nmi_cpu_backtrace() etc. and fixes the build. Fixes: 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220212111349.2806972-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index 160abcb8e9fa..ea0e487f87b1 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -9,7 +9,7 @@ long soft_nmi_interrupt(struct pt_regs *regs); static inline void arch_touch_nmi_watchdog(void) {} #endif -#if defined(CONFIG_NMI_IPI) && defined(CONFIG_STACKTRACE) +#ifdef CONFIG_NMI_IPI extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace -- cgit v1.2.3 From 38a1756861b8fc2ea9afb93e231194c642a4e261 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 13 Feb 2022 10:02:41 +0100 Subject: powerpc: Don't allow the use of EMIT_BUG_ENTRY with BUGFLAG_WARNING Warnings in assembly must use EMIT_WARN_ENTRY in order to generate the necessary entry in exception table. Check in EMIT_BUG_ENTRY that flags don't include BUGFLAG_WARNING. This change avoids problems like the one fixed by commit fd1eaaaaa686 ("powerpc/64s: Use EMIT_WARN_ENTRY for SRR debug warnings"). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ddcb422102a37eb45f57694c7ef0ec6187964dff.1644742951.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/bug.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 02c08d1492f8..ecbae1832de3 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -11,7 +11,7 @@ #ifdef __ASSEMBLY__ #include #ifdef CONFIG_DEBUG_BUGVERBOSE -.macro EMIT_BUG_ENTRY addr,file,line,flags +.macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" 5001: .4byte \addr - 5001b, 5002f - 5001b .short \line, \flags @@ -22,7 +22,7 @@ .previous .endm #else -.macro EMIT_BUG_ENTRY addr,file,line,flags +.macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" 5001: .4byte \addr - 5001b .short \flags @@ -33,7 +33,14 @@ .macro EMIT_WARN_ENTRY addr,file,line,flags EX_TABLE(\addr,\addr+4) - EMIT_BUG_ENTRY \addr,\file,\line,\flags + __EMIT_BUG_ENTRY \addr,\file,\line,\flags +.endm + +.macro EMIT_BUG_ENTRY addr,file,line,flags + .if \flags & 1 /* BUGFLAG_WARNING */ + .err /* Use EMIT_WARN_ENTRY for warnings */ + .endif + __EMIT_BUG_ENTRY \addr,\file,\line,\flags .endm #else /* !__ASSEMBLY__ */ -- cgit v1.2.3 From fc75f87337983229b7355d6b77f30fb6e7f359ee Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 19:31:24 +0100 Subject: powerpc/ftrace: Have arch_ftrace_get_regs() return NULL unless FL_SAVE_REGS is set When FL_SAVE_REGS is not set we get here via ftrace_caller() which doesn't save all registers. ftrace_caller() explicitely clears regs.msr, so we can rely on it to know where we come from. We don't expect MSR register to be 0 at all when involving ftrace. Fixes: 40b035efe288 ("powerpc/ftrace: Implement CONFIG_DYNAMIC_FTRACE_WITH_ARGS") Reported-by: Naveen N. Rao Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2f9a7e898c93cc7438ef5ccd47cb9c3a9c5b53ef.1644949750.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ftrace.h | 3 ++- arch/powerpc/kernel/trace/ftrace_mprofile.S | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 70b457097098..ff034ae4e472 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -30,7 +30,8 @@ struct ftrace_regs { static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { - return &fregs->regs; + /* We clear regs.msr in ftrace_call */ + return fregs->regs.msr ? &fregs->regs : NULL; } static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs, diff --git a/arch/powerpc/kernel/trace/ftrace_mprofile.S b/arch/powerpc/kernel/trace/ftrace_mprofile.S index 8443902d5a05..eb077270ec2f 100644 --- a/arch/powerpc/kernel/trace/ftrace_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_mprofile.S @@ -205,6 +205,10 @@ _GLOBAL(ftrace_caller) PPC_STL r0, _LINK(r1) mr r4, r0 + /* Clear MSR to flag as ftrace_caller versus frace_regs_caller */ + li r8, 0 + PPC_STL r8, _MSR(r1) + /* Load &pt_regs in r6 for call below */ addi r6, r1 ,STACK_FRAME_OVERHEAD -- cgit v1.2.3 From bbbca72352bb9484bc057c91a408332b35ee8f4c Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Tue, 25 Jan 2022 01:52:04 +0530 Subject: powerpc/papr_scm: Implement initial support for injecting smart errors Presently PAPR doesn't support injecting smart errors on an NVDIMM. This makes testing the NVDIMM health reporting functionality difficult as simulating NVDIMM health related events need a hacked up qemu version. To solve this problem this patch proposes simulating certain set of NVDIMM health related events in papr_scm. Specifically 'fatal' health state and 'dirty' shutdown state. These error can be injected via the user-space 'ndctl-inject-smart(1)' command. With the proposed patch and corresponding ndctl patches following command flow is expected: $ sudo ndctl list -DH -d nmem0 ... "health_state":"ok", "shutdown_state":"clean", ... # inject unsafe shutdown and fatal health error $ sudo ndctl inject-smart nmem0 -Uf ... "health_state":"fatal", "shutdown_state":"dirty", ... # uninject all errors $ sudo ndctl inject-smart nmem0 -N ... "health_state":"ok", "shutdown_state":"clean", ... The patch adds a new member 'health_bitmap_inject_mask' inside struct papr_scm_priv which is then bitwise ANDed to the health bitmap fetched from the hypervisor. The value for 'health_bitmap_inject_mask' is accessible from sysfs at nmemX/papr/health_bitmap_inject. A new PDSM named 'SMART_INJECT' is proposed that accepts newly introduced 'struct nd_papr_pdsm_smart_inject' as payload thats exchanged between libndctl and papr_scm to indicate the requested smart-error states. When the processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject() constructs a pair or 'inject_mask' and 'clear_mask' bitmaps from the payload and bit-blt it to the 'health_bitmap_inject_mask'. This ensures the after being fetched from the hypervisor, the health_bitmap reflects requested smart-error states. Signed-off-by: Vaibhav Jain Signed-off-by: Shivaprasad G Bhat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220124202204.1488346-1-vaibhav@linux.ibm.com --- Documentation/ABI/testing/sysfs-bus-papr-pmem | 12 ++++ arch/powerpc/include/uapi/asm/papr_pdsm.h | 18 ++++++ arch/powerpc/platforms/pseries/papr_scm.c | 90 ++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem index 95254cec92bf..4ac0673901e7 100644 --- a/Documentation/ABI/testing/sysfs-bus-papr-pmem +++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem @@ -61,3 +61,15 @@ Description: * "CchRHCnt" : Cache Read Hit Count * "CchWHCnt" : Cache Write Hit Count * "FastWCnt" : Fast Write Count + +What: /sys/bus/nd/devices/nmemX/papr/health_bitmap_inject +Date: Jan, 2022 +KernelVersion: v5.17 +Contact: linuxppc-dev , nvdimm@lists.linux.dev, +Description: + (RO) Reports the health bitmap inject bitmap that is applied to + bitmap received from PowerVM via the H_SCM_HEALTH. This is used + to forcibly set specific bits returned from Hcall. These is then + used to simulate various health or shutdown states for an nvdimm + and are set by user-space tools like ndctl by issuing a PAPR DSM. + diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h b/arch/powerpc/include/uapi/asm/papr_pdsm.h index 82488b1e7276..17439925045c 100644 --- a/arch/powerpc/include/uapi/asm/papr_pdsm.h +++ b/arch/powerpc/include/uapi/asm/papr_pdsm.h @@ -116,6 +116,22 @@ struct nd_papr_pdsm_health { }; }; +/* Flags for injecting specific smart errors */ +#define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0) +#define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1) + +struct nd_papr_pdsm_smart_inject { + union { + struct { + /* One or more of PDSM_SMART_INJECT_ */ + __u32 flags; + __u8 fatal_enable; + __u8 unsafe_shutdown_enable; + }; + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; + }; +}; + /* * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel * via 'nd_cmd_pkg.nd_command' member of the ioctl struct @@ -123,12 +139,14 @@ struct nd_papr_pdsm_health { enum papr_pdsm { PAPR_PDSM_MIN = 0x0, PAPR_PDSM_HEALTH, + PAPR_PDSM_SMART_INJECT, PAPR_PDSM_MAX, }; /* Maximal union that can hold all possible payload types */ union nd_pdsm_payload { struct nd_papr_pdsm_health health; + struct nd_papr_pdsm_smart_inject smart_inject; __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; } __packed; diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f48e87ac89c9..20aafd387840 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -120,6 +120,10 @@ struct papr_scm_priv { /* length of the stat buffer as expected by phyp */ size_t stat_buffer_len; + + /* The bits which needs to be overridden */ + u64 health_bitmap_inject_mask; + }; static int papr_scm_pmem_flush(struct nd_region *nd_region, @@ -347,19 +351,29 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p, static int __drc_pmem_query_health(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; + u64 bitmap = 0; long rc; /* issue the hcall */ rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index); - if (rc != H_SUCCESS) { + if (rc == H_SUCCESS) + bitmap = ret[0] & ret[1]; + else if (rc == H_FUNCTION) + dev_info_once(&p->pdev->dev, + "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap"); + else { + dev_err(&p->pdev->dev, "Failed to query health information, Err:%ld\n", rc); return -ENXIO; } p->lasthealth_jiffies = jiffies; - p->health_bitmap = ret[0] & ret[1]; - + /* Allow injecting specific health bits via inject mask. */ + if (p->health_bitmap_inject_mask) + bitmap = (bitmap & ~p->health_bitmap_inject_mask) | + p->health_bitmap_inject_mask; + WRITE_ONCE(p->health_bitmap, bitmap); dev_dbg(&p->pdev->dev, "Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n", ret[0], ret[1]); @@ -669,6 +683,56 @@ out: return rc; } +/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */ +static int papr_pdsm_smart_inject(struct papr_scm_priv *p, + union nd_pdsm_payload *payload) +{ + int rc; + u32 supported_flags = 0; + u64 inject_mask = 0, clear_mask = 0; + u64 mask; + + /* Check for individual smart error flags and update inject/clear masks */ + if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) { + supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL; + if (payload->smart_inject.fatal_enable) + inject_mask |= PAPR_PMEM_HEALTH_FATAL; + else + clear_mask |= PAPR_PMEM_HEALTH_FATAL; + } + + if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) { + supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN; + if (payload->smart_inject.unsafe_shutdown_enable) + inject_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; + else + clear_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; + } + + dev_dbg(&p->pdev->dev, "[Smart-inject] inject_mask=%#llx clear_mask=%#llx\n", + inject_mask, clear_mask); + + /* Prevent concurrent access to dimm health bitmap related members */ + rc = mutex_lock_interruptible(&p->health_mutex); + if (rc) + return rc; + + /* Use inject/clear masks to set health_bitmap_inject_mask */ + mask = READ_ONCE(p->health_bitmap_inject_mask); + mask = (mask & ~clear_mask) | inject_mask; + WRITE_ONCE(p->health_bitmap_inject_mask, mask); + + /* Invalidate cached health bitmap */ + p->lasthealth_jiffies = 0; + + mutex_unlock(&p->health_mutex); + + /* Return the supported flags back to userspace */ + payload->smart_inject.flags = supported_flags; + + return sizeof(struct nd_papr_pdsm_health); +} + /* * 'struct pdsm_cmd_desc' * Identifies supported PDSMs' expected length of in/out payloads @@ -702,6 +766,12 @@ static const struct pdsm_cmd_desc __pdsm_cmd_descriptors[] = { .size_out = sizeof(struct nd_papr_pdsm_health), .service = papr_pdsm_health, }, + + [PAPR_PDSM_SMART_INJECT] = { + .size_in = sizeof(struct nd_papr_pdsm_smart_inject), + .size_out = sizeof(struct nd_papr_pdsm_smart_inject), + .service = papr_pdsm_smart_inject, + }, /* Empty */ [PAPR_PDSM_MAX] = { .size_in = 0, @@ -838,6 +908,19 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, return 0; } +static ssize_t health_bitmap_inject_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); + + return sprintf(buf, "%#llx\n", + READ_ONCE(p->health_bitmap_inject_mask)); +} + +static DEVICE_ATTR_ADMIN_RO(health_bitmap_inject); + static ssize_t perf_stats_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -952,6 +1035,7 @@ static struct attribute *papr_nd_attributes[] = { &dev_attr_flags.attr, &dev_attr_perf_stats.attr, &dev_attr_dirty_shutdown.attr, + &dev_attr_health_bitmap_inject.attr, NULL, }; -- cgit v1.2.3 From 5b23cb8cc6b0aab0535253cc2aa362572bab7072 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:40:57 +0100 Subject: powerpc: Move and rename func_descr_t There are three architectures with function descriptors, try to have common names for the address they contain in order to refactor some functions into generic functions later. powerpc has 'entry' ia64 has 'ip' parisc has 'addr' Vote for 'addr' and update 'func_descr_t' accordingly. Move it in asm/elf.h to have it at the same place on all three architectures, remove the typedef which hides its real type, and change it to a smoother name 'struct func_desc'. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Reviewed-by: Kees Cook Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/529b2ba1d001e8f628ef0d30e8044c9b3d0a4921.1644928018.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 2 +- arch/powerpc/include/asm/elf.h | 6 ++++++ arch/powerpc/include/asm/types.h | 6 ------ arch/powerpc/kernel/signal_64.c | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index e26080539c31..409483b2d0ce 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -118,7 +118,7 @@ static inline unsigned long ppc_function_entry(void *func) * function's descriptor. The first entry in the descriptor is the * address of the function text. */ - return ((func_descr_t *)func)->entry; + return ((struct func_desc *)func)->addr; #else return (unsigned long)func; #endif diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index b8425e3cfd81..971589a21bc0 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -176,4 +176,10 @@ do { \ /* Relocate the kernel image to @final_address */ void relocate(unsigned long final_address); +struct func_desc { + unsigned long addr; + unsigned long toc; + unsigned long env; +}; + #endif /* _ASM_POWERPC_ELF_H */ diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index f1630c553efe..97da77bc48c9 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -23,12 +23,6 @@ typedef __vector128 vector128; -typedef struct { - unsigned long entry; - unsigned long toc; - unsigned long env; -} func_descr_t; - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_TYPES_H */ diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index d1e1fc0acbea..73d483b07ff3 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -936,11 +936,11 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, * descriptor is the entry address of signal and the second * entry is the TOC value we need to use. */ - func_descr_t __user *funct_desc_ptr = - (func_descr_t __user *) ksig->ka.sa.sa_handler; + struct func_desc __user *ptr = + (struct func_desc __user *)ksig->ka.sa.sa_handler; - err |= get_user(regs->ctr, &funct_desc_ptr->entry); - err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); + err |= get_user(regs->ctr, &ptr->addr); + err |= get_user(regs->gpr[2], &ptr->toc); } /* enter the signal handler in native-endian mode */ -- cgit v1.2.3 From d3e32b997a4ca2e7be71cb770bcb2c000ee20b36 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:40:58 +0100 Subject: powerpc: Use 'struct func_desc' instead of 'struct ppc64_opd_entry' 'struct ppc64_opd_entry' is somehow redundant with 'struct func_desc', the later is more correct/complete as it includes the third field which is unused. So use 'struct func_desc' instead of 'struct ppc64_opd_entry' Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Reviewed-by: Daniel Axtens Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/34e76bac6cbe95a63ecd37df69fb7feb93b0ea7c.1644928018.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/sections.h | 4 ++-- arch/powerpc/kernel/module_64.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 38f79e42bf3c..baca39f4c6d3 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -61,10 +61,10 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end) #undef dereference_function_descriptor static inline void *dereference_function_descriptor(void *ptr) { - struct ppc64_opd_entry *desc = ptr; + struct func_desc *desc = ptr; void *p; - if (!get_kernel_nofault(p, (void *)&desc->funcaddr)) + if (!get_kernel_nofault(p, (void *)&desc->addr)) ptr = p; return ptr; } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 5d77d3f5fbb5..d2082f236bc1 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -64,19 +64,19 @@ static unsigned int local_entry_offset(const Elf64_Sym *sym) #else /* An address is address of the OPD entry, which contains address of fn. */ -typedef struct ppc64_opd_entry func_desc_t; +typedef struct func_desc func_desc_t; static func_desc_t func_desc(unsigned long addr) { - return *(struct ppc64_opd_entry *)addr; + return *(struct func_desc *)addr; } static unsigned long func_addr(unsigned long addr) { - return func_desc(addr).funcaddr; + return func_desc(addr).addr; } static unsigned long stub_func_addr(func_desc_t func) { - return func.funcaddr; + return func.addr; } static unsigned int local_entry_offset(const Elf64_Sym *sym) { @@ -187,7 +187,7 @@ static int relacmp(const void *_x, const void *_y) static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, const Elf64_Shdr *sechdrs) { - /* One extra reloc so it's always 0-funcaddr terminated */ + /* One extra reloc so it's always 0-addr terminated */ unsigned long relocs = 1; unsigned i; -- cgit v1.2.3 From 0a9c5ae279c963149df9a84588281d3d607f7a1f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:40:59 +0100 Subject: powerpc: Remove 'struct ppc64_opd_entry' 'struct ppc64_opd_entry' doesn't belong to uapi/asm/elf.h It was initially in module_64.c and commit 2d291e902791 ("Fix compile failure with non modular builds") moved it into asm/elf.h But it was by mistake added outside of __KERNEL__ section, therefore commit c3617f72036c ("UAPI: (Scripted) Disintegrate arch/powerpc/include/asm") moved it to uapi/asm/elf.h Now that it is not used anymore by the kernel, remove it. Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c309ccee65ec2e3802df7a7fe761d0a298584809.1644928018.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/uapi/asm/elf.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index 860c59291bfc..308857123a08 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -289,12 +289,4 @@ typedef elf_fpreg_t elf_vsrreghalf_t32[ELF_NVSRHALFREG]; /* Keep this the last entry. */ #define R_PPC64_NUM 253 -/* There's actually a third entry here, but it's unused */ -struct ppc64_opd_entry -{ - unsigned long funcaddr; - unsigned long r2; -}; - - #endif /* _UAPI_ASM_POWERPC_ELF_H */ -- cgit v1.2.3 From a257cacc38718c83cee003487e03197f237f5c3f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:41:02 +0100 Subject: asm-generic: Define CONFIG_HAVE_FUNCTION_DESCRIPTORS Replace HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR by a config option named CONFIG_HAVE_FUNCTION_DESCRIPTORS and use it instead of 'dereference_function_descriptor' macro to know whether an arch has function descriptors. To limit churn in one of the following patches, use an #ifdef/#else construct with empty first part instead of an #ifndef in asm-generic/sections.h On powerpc, make sure the config option matches the ABI used by the compiler with a BUILD_BUG_ON() and add missing _CALL_ELF=2 when calling 'sparse' so that sparse sees the same piece of code as GCC. And include a helper to check whether an arch has function descriptors or not : have_function_descriptors() Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Reviewed-by: Nicholas Piggin Acked-by: Helge Deller Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4a0f11fb0ea74a3197bc44dd7ba25e53a24fd03d.1644928018.git.christophe.leroy@csgroup.eu --- arch/Kconfig | 3 +++ arch/ia64/Kconfig | 1 + arch/ia64/include/asm/sections.h | 2 -- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/sections.h | 2 -- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/sections.h | 2 -- arch/powerpc/kernel/ptrace/ptrace.c | 6 ++++++ include/asm-generic/sections.h | 8 +++++++- include/linux/kallsyms.h | 2 +- 10 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/Kconfig b/arch/Kconfig index 678a80713b21..fe24174cb63c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -205,6 +205,9 @@ config HAVE_FUNCTION_ERROR_INJECTION config HAVE_NMI bool +config HAVE_FUNCTION_DESCRIPTORS + bool + config TRACE_IRQFLAGS_SUPPORT bool diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index a7e01573abd8..da85c3b23b16 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -35,6 +35,7 @@ config IA64 select HAVE_SETUP_PER_CPU_AREA select TTY select HAVE_ARCH_TRACEHOOK + select HAVE_FUNCTION_DESCRIPTORS select HAVE_VIRT_CPU_ACCOUNTING select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE select VIRT_TO_BUS diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h index 35f24e52149a..2460d365a057 100644 --- a/arch/ia64/include/asm/sections.h +++ b/arch/ia64/include/asm/sections.h @@ -27,8 +27,6 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_b extern char __start_unwind[], __end_unwind[]; extern char __start_ivt_text[], __end_ivt_text[]; -#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1 - #undef dereference_function_descriptor static inline void *dereference_function_descriptor(void *ptr) { diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 43c1c880def6..82e7ab1a9764 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -69,6 +69,7 @@ config PARISC select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS select TRACE_IRQFLAGS_SUPPORT + select HAVE_FUNCTION_DESCRIPTORS if 64BIT help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/sections.h b/arch/parisc/include/asm/sections.h index bb52aea0cb21..c8092e4d94de 100644 --- a/arch/parisc/include/asm/sections.h +++ b/arch/parisc/include/asm/sections.h @@ -9,8 +9,6 @@ extern char __alt_instructions[], __alt_instructions_end[]; #ifdef CONFIG_64BIT -#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1 - #undef dereference_function_descriptor void *dereference_function_descriptor(void *); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b779603978e1..a0c9cd0bbc85 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -202,6 +202,7 @@ config PPC select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_DESCRIPTORS if PPC64 && !CPU_LITTLE_ENDIAN select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index baca39f4c6d3..7728a7a146c3 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -56,8 +56,6 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end) #ifdef PPC64_ELF_ABI_v1 -#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1 - #undef dereference_function_descriptor static inline void *dereference_function_descriptor(void *ptr) { diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index c43f77e2ac31..1212a812a7ab 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -445,4 +445,10 @@ void __init pt_regs_check(void) * real registers. */ BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long)); + +#ifdef PPC64_ELF_ABI_v1 + BUILD_BUG_ON(!IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS)); +#else + BUILD_BUG_ON(IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS)); +#endif } diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 690f741764e1..3ef83e1aebee 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -59,11 +59,17 @@ extern char __noinstr_text_start[], __noinstr_text_end[]; extern __visible const void __nosave_begin, __nosave_end; /* Function descriptor handling (if any). Override in asm/sections.h */ -#ifndef dereference_function_descriptor +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +#else #define dereference_function_descriptor(p) ((void *)(p)) #define dereference_kernel_function_descriptor(p) ((void *)(p)) #endif +static inline bool have_function_descriptors(void) +{ + return IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS); +} + /** * memory_contains - checks if an object is contained within a memory region * @begin: virtual address of the beginning of the memory region diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 4176c7eca7b5..ce1bd2fbf23e 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -48,7 +48,7 @@ static inline int is_ksym_addr(unsigned long addr) static inline void *dereference_symbol_descriptor(void *ptr) { -#ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); -- cgit v1.2.3 From 0dc690e4ef5b901e9d4b53520854fbd5c749e09d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:41:03 +0100 Subject: asm-generic: Define 'func_desc_t' to commonly describe function descriptors We have three architectures using function descriptors, each with its own type and name. Add a common typedef that can be used in generic code. Also add a stub typedef for architecture without function descriptors, to avoid a forest of #ifdefs. It replaces the similar 'func_desc_t' previously defined in arch/powerpc/kernel/module_64.c Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Acked-by: Arnd Bergmann Acked-by: Helge Deller Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f1f91b142b3c1082bdc1586ce71c9bac1e75213c.1644928018.git.christophe.leroy@csgroup.eu --- arch/ia64/include/asm/sections.h | 3 +++ arch/parisc/include/asm/sections.h | 5 +++++ arch/powerpc/include/asm/sections.h | 4 ++++ arch/powerpc/kernel/module_64.c | 8 -------- include/asm-generic/sections.h | 5 +++++ 5 files changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h index 2460d365a057..3abe0562b01a 100644 --- a/arch/ia64/include/asm/sections.h +++ b/arch/ia64/include/asm/sections.h @@ -9,6 +9,9 @@ #include #include + +typedef struct fdesc func_desc_t; + #include extern char __phys_per_cpu_start[]; diff --git a/arch/parisc/include/asm/sections.h b/arch/parisc/include/asm/sections.h index c8092e4d94de..ace1d4047a0b 100644 --- a/arch/parisc/include/asm/sections.h +++ b/arch/parisc/include/asm/sections.h @@ -2,6 +2,11 @@ #ifndef _PARISC_SECTIONS_H #define _PARISC_SECTIONS_H +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +#include +typedef Elf64_Fdesc func_desc_t; +#endif + /* nothing to see, move along */ #include diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 7728a7a146c3..fddfb3937868 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -6,6 +6,10 @@ #include #include +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +typedef struct func_desc func_desc_t; +#endif + #include extern char __head_end[]; diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index f81bab3eb8e9..0337b46424bc 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -32,11 +32,6 @@ #ifdef PPC64_ELF_ABI_v2 -/* An address is simply the address of the function. */ -typedef struct { - unsigned long addr; -} func_desc_t; - static func_desc_t func_desc(unsigned long addr) { func_desc_t desc = { @@ -61,9 +56,6 @@ static unsigned int local_entry_offset(const Elf64_Sym *sym) } #else -/* An address is address of the OPD entry, which contains address of fn. */ -typedef struct func_desc func_desc_t; - static func_desc_t func_desc(unsigned long addr) { return *(struct func_desc *)addr; diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 3ef83e1aebee..bbf97502470c 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -63,6 +63,11 @@ extern __visible const void __nosave_begin, __nosave_end; #else #define dereference_function_descriptor(p) ((void *)(p)) #define dereference_kernel_function_descriptor(p) ((void *)(p)) + +/* An address is simply the address of the function. */ +typedef struct { + unsigned long addr; +} func_desc_t; #endif static inline bool have_function_descriptors(void) -- cgit v1.2.3 From e1478d8eaf27704db17a44dee4c53696ed01fc9c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:41:04 +0100 Subject: asm-generic: Refactor dereference_[kernel]_function_descriptor() dereference_function_descriptor() and dereference_kernel_function_descriptor() are identical on the three architectures implementing them. Make them common and put them out-of-line in kernel/extable.c which is one of the users and has similar type of functions. Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Reviewed-by: Arnd Bergmann Acked-by: Helge Deller Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/449db09b2eba57f4ab05f80102a67d8675bc8bcd.1644928018.git.christophe.leroy@csgroup.eu --- arch/ia64/include/asm/sections.h | 19 ------------------- arch/parisc/include/asm/sections.h | 9 --------- arch/parisc/kernel/process.c | 21 --------------------- arch/powerpc/include/asm/sections.h | 23 ----------------------- include/asm-generic/sections.h | 2 ++ kernel/extable.c | 23 ++++++++++++++++++++++- 6 files changed, 24 insertions(+), 73 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h index 3abe0562b01a..8e0875cf6071 100644 --- a/arch/ia64/include/asm/sections.h +++ b/arch/ia64/include/asm/sections.h @@ -30,23 +30,4 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_b extern char __start_unwind[], __end_unwind[]; extern char __start_ivt_text[], __end_ivt_text[]; -#undef dereference_function_descriptor -static inline void *dereference_function_descriptor(void *ptr) -{ - struct fdesc *desc = ptr; - void *p; - - if (!get_kernel_nofault(p, (void *)&desc->addr)) - ptr = p; - return ptr; -} - -#undef dereference_kernel_function_descriptor -static inline void *dereference_kernel_function_descriptor(void *ptr) -{ - if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) - return ptr; - return dereference_function_descriptor(ptr); -} - #endif /* _ASM_IA64_SECTIONS_H */ diff --git a/arch/parisc/include/asm/sections.h b/arch/parisc/include/asm/sections.h index ace1d4047a0b..33df42b5cc6d 100644 --- a/arch/parisc/include/asm/sections.h +++ b/arch/parisc/include/asm/sections.h @@ -12,13 +12,4 @@ typedef Elf64_Fdesc func_desc_t; extern char __alt_instructions[], __alt_instructions_end[]; -#ifdef CONFIG_64BIT - -#undef dereference_function_descriptor -void *dereference_function_descriptor(void *); - -#undef dereference_kernel_function_descriptor -void *dereference_kernel_function_descriptor(void *); -#endif - #endif diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index ea3d83b6fb62..2030c77592d3 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -263,27 +263,6 @@ __get_wchan(struct task_struct *p) return 0; } -#ifdef CONFIG_64BIT -void *dereference_function_descriptor(void *ptr) -{ - Elf64_Fdesc *desc = ptr; - void *p; - - if (!get_kernel_nofault(p, (void *)&desc->addr)) - ptr = p; - return ptr; -} - -void *dereference_kernel_function_descriptor(void *ptr) -{ - if (ptr < (void *)__start_opd || - ptr >= (void *)__end_opd) - return ptr; - - return dereference_function_descriptor(ptr); -} -#endif - static inline unsigned long brk_rnd(void) { return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index fddfb3937868..8be2c491c733 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -58,29 +58,6 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end) (unsigned long)_stext < end; } -#ifdef PPC64_ELF_ABI_v1 - -#undef dereference_function_descriptor -static inline void *dereference_function_descriptor(void *ptr) -{ - struct func_desc *desc = ptr; - void *p; - - if (!get_kernel_nofault(p, (void *)&desc->addr)) - ptr = p; - return ptr; -} - -#undef dereference_kernel_function_descriptor -static inline void *dereference_kernel_function_descriptor(void *ptr) -{ - if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) - return ptr; - - return dereference_function_descriptor(ptr); -} -#endif /* PPC64_ELF_ABI_v1 */ - #endif #endif /* __KERNEL__ */ diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index bbf97502470c..d0f7bdd2fdf2 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -60,6 +60,8 @@ extern __visible const void __nosave_begin, __nosave_end; /* Function descriptor handling (if any). Override in asm/sections.h */ #ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +void *dereference_function_descriptor(void *ptr); +void *dereference_kernel_function_descriptor(void *ptr); #else #define dereference_function_descriptor(p) ((void *)(p)) #define dereference_kernel_function_descriptor(p) ((void *)(p)) diff --git a/kernel/extable.c b/kernel/extable.c index b6f330f0fe74..394c39b86e38 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -3,6 +3,7 @@ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. */ +#include #include #include #include @@ -132,12 +133,32 @@ out: } /* - * On some architectures (PPC64, IA64) function pointers + * On some architectures (PPC64, IA64, PARISC) function pointers * are actually only tokens to some data that then holds the * real function address. As a result, to find if a function * pointer is part of the kernel text, we need to do some * special dereferencing first. */ +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +void *dereference_function_descriptor(void *ptr) +{ + func_desc_t *desc = ptr; + void *p; + + if (!get_kernel_nofault(p, (void *)&desc->addr)) + ptr = p; + return ptr; +} + +void *dereference_kernel_function_descriptor(void *ptr) +{ + if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) + return ptr; + + return dereference_function_descriptor(ptr); +} +#endif + int func_ptr_is_kernel_text(void *ptr) { unsigned long addr; -- cgit v1.2.3 From 72113d0a7d90d950c7c9a87ab905bffb6bc5752d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 10 Feb 2022 11:11:24 +0900 Subject: signal.h: add linux/signal.h and asm/signal.h to UAPI compile-test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/signal.h and asm/signal.h are currently excluded from the UAPI compile-test because of the errors like follows: HDRTEST usr/include/asm/signal.h In file included from : ./usr/include/asm/signal.h:103:9: error: unknown type name ‘size_t’ 103 | size_t ss_size; | ^~~~~~ The errors can be fixed by replacing size_t with __kernel_size_t. Then, remove the no-header-test entries from user/include/Makefile. Signed-off-by: Masahiro Yamada Reviewed-by: Arnd Bergmann Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Signed-off-by: Arnd Bergmann --- arch/alpha/include/uapi/asm/signal.h | 2 +- arch/arm/include/uapi/asm/signal.h | 2 +- arch/h8300/include/uapi/asm/signal.h | 2 +- arch/ia64/include/uapi/asm/signal.h | 2 +- arch/m68k/include/uapi/asm/signal.h | 2 +- arch/mips/include/uapi/asm/signal.h | 2 +- arch/parisc/include/uapi/asm/signal.h | 2 +- arch/powerpc/include/uapi/asm/signal.h | 2 +- arch/s390/include/uapi/asm/signal.h | 2 +- arch/sparc/include/uapi/asm/signal.h | 3 ++- arch/x86/include/uapi/asm/signal.h | 2 +- arch/xtensa/include/uapi/asm/signal.h | 2 +- include/uapi/asm-generic/signal.h | 2 +- usr/include/Makefile | 2 -- 14 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/uapi/asm/signal.h b/arch/alpha/include/uapi/asm/signal.h index a69dd8d080a8..1413075f7616 100644 --- a/arch/alpha/include/uapi/asm/signal.h +++ b/arch/alpha/include/uapi/asm/signal.h @@ -100,7 +100,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; /* sigstack(2) is deprecated, and will be withdrawn in a future version diff --git a/arch/arm/include/uapi/asm/signal.h b/arch/arm/include/uapi/asm/signal.h index c9a3ea1d8d41..9e2178420db2 100644 --- a/arch/arm/include/uapi/asm/signal.h +++ b/arch/arm/include/uapi/asm/signal.h @@ -93,7 +93,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/h8300/include/uapi/asm/signal.h b/arch/h8300/include/uapi/asm/signal.h index 2cd0dce2b6a6..1165481f80f6 100644 --- a/arch/h8300/include/uapi/asm/signal.h +++ b/arch/h8300/include/uapi/asm/signal.h @@ -85,7 +85,7 @@ struct sigaction { typedef struct sigaltstack { void *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/ia64/include/uapi/asm/signal.h b/arch/ia64/include/uapi/asm/signal.h index 38166a88e4c9..63d574e802a2 100644 --- a/arch/ia64/include/uapi/asm/signal.h +++ b/arch/ia64/include/uapi/asm/signal.h @@ -90,7 +90,7 @@ struct siginfo; typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/m68k/include/uapi/asm/signal.h b/arch/m68k/include/uapi/asm/signal.h index 4619291df601..80f520b9b10b 100644 --- a/arch/m68k/include/uapi/asm/signal.h +++ b/arch/m68k/include/uapi/asm/signal.h @@ -83,7 +83,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; #endif /* _UAPI_M68K_SIGNAL_H */ diff --git a/arch/mips/include/uapi/asm/signal.h b/arch/mips/include/uapi/asm/signal.h index e6c78a15cb2f..94a00f82e373 100644 --- a/arch/mips/include/uapi/asm/signal.h +++ b/arch/mips/include/uapi/asm/signal.h @@ -100,7 +100,7 @@ struct sigaction { /* IRIX compatible stack_t */ typedef struct sigaltstack { void __user *ss_sp; - size_t ss_size; + __kernel_size_t ss_size; int ss_flags; } stack_t; diff --git a/arch/parisc/include/uapi/asm/signal.h b/arch/parisc/include/uapi/asm/signal.h index e5a2657477ac..8e4895c5ea5d 100644 --- a/arch/parisc/include/uapi/asm/signal.h +++ b/arch/parisc/include/uapi/asm/signal.h @@ -67,7 +67,7 @@ struct siginfo; typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; #endif /* !__ASSEMBLY */ diff --git a/arch/powerpc/include/uapi/asm/signal.h b/arch/powerpc/include/uapi/asm/signal.h index 04873dd311c2..37d41d87c45b 100644 --- a/arch/powerpc/include/uapi/asm/signal.h +++ b/arch/powerpc/include/uapi/asm/signal.h @@ -86,7 +86,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h index 0189f326aac5..e74d6ba1bd3b 100644 --- a/arch/s390/include/uapi/asm/signal.h +++ b/arch/s390/include/uapi/asm/signal.h @@ -108,7 +108,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/sparc/include/uapi/asm/signal.h b/arch/sparc/include/uapi/asm/signal.h index 53758d53ac0e..d395af9b46d2 100644 --- a/arch/sparc/include/uapi/asm/signal.h +++ b/arch/sparc/include/uapi/asm/signal.h @@ -2,6 +2,7 @@ #ifndef _UAPI__SPARC_SIGNAL_H #define _UAPI__SPARC_SIGNAL_H +#include #include #include @@ -171,7 +172,7 @@ struct __old_sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index 164a22a72984..777c3a0f4e23 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -104,7 +104,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; #endif /* __ASSEMBLY__ */ diff --git a/arch/xtensa/include/uapi/asm/signal.h b/arch/xtensa/include/uapi/asm/signal.h index 79ddabaa4e5d..b8c824dd4b74 100644 --- a/arch/xtensa/include/uapi/asm/signal.h +++ b/arch/xtensa/include/uapi/asm/signal.h @@ -103,7 +103,7 @@ struct sigaction { typedef struct sigaltstack { void *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; #endif /* __ASSEMBLY__ */ diff --git a/include/uapi/asm-generic/signal.h b/include/uapi/asm-generic/signal.h index f634822906e4..0eb69dc8e572 100644 --- a/include/uapi/asm-generic/signal.h +++ b/include/uapi/asm-generic/signal.h @@ -85,7 +85,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; #endif /* __ASSEMBLY__ */ diff --git a/usr/include/Makefile b/usr/include/Makefile index 83822c33e9e7..3150b86748e9 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -21,7 +21,6 @@ override c_flags = $(UAPI_CFLAGS) -Wp,-MMD,$(depfile) -I$(objtree)/usr/include # # Sorted alphabetically. no-header-test += asm/shmbuf.h -no-header-test += asm/signal.h no-header-test += asm/ucontext.h no-header-test += drm/vmwgfx_drm.h no-header-test += linux/am437x-vpfe.h @@ -41,7 +40,6 @@ no-header-test += linux/patchkey.h no-header-test += linux/phonet.h no-header-test += linux/reiserfs_xattr.h no-header-test += linux/sctp.h -no-header-test += linux/signal.h no-header-test += linux/sysctl.h no-header-test += linux/usb/audio.h no-header-test += linux/v4l2-mediabus.h -- cgit v1.2.3 From 4a3233c1a69885aa7e71c48ff39ae11c212ac90a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 10 Feb 2022 11:11:25 +0900 Subject: shmbuf.h: add asm/shmbuf.h to UAPI compile-test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit asm/shmbuf.h is currently excluded from the UAPI compile-test because of the errors like follows: HDRTEST usr/include/asm/shmbuf.h In file included from ./usr/include/asm/shmbuf.h:6, from : ./usr/include/asm-generic/shmbuf.h:26:33: error: field ‘shm_perm’ has incomplete type 26 | struct ipc64_perm shm_perm; /* operation perms */ | ^~~~~~~~ ./usr/include/asm-generic/shmbuf.h:27:9: error: unknown type name ‘size_t’ 27 | size_t shm_segsz; /* size of segment (bytes) */ | ^~~~~~ ./usr/include/asm-generic/shmbuf.h:40:9: error: unknown type name ‘__kernel_pid_t’ 40 | __kernel_pid_t shm_cpid; /* pid of creator */ | ^~~~~~~~~~~~~~ ./usr/include/asm-generic/shmbuf.h:41:9: error: unknown type name ‘__kernel_pid_t’ 41 | __kernel_pid_t shm_lpid; /* pid of last operator */ | ^~~~~~~~~~~~~~ The errors can be fixed by replacing size_t with __kernel_size_t and by including proper headers. Then, remove the no-header-test entry from user/include/Makefile. Signed-off-by: Masahiro Yamada Reviewed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann --- arch/mips/include/uapi/asm/shmbuf.h | 7 +++++-- arch/parisc/include/uapi/asm/shmbuf.h | 2 ++ arch/powerpc/include/uapi/asm/shmbuf.h | 5 ++++- arch/sparc/include/uapi/asm/shmbuf.h | 5 ++++- arch/x86/include/uapi/asm/shmbuf.h | 6 +++++- arch/xtensa/include/uapi/asm/shmbuf.h | 5 ++++- include/uapi/asm-generic/shmbuf.h | 4 +++- usr/include/Makefile | 1 - 8 files changed, 27 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/mips/include/uapi/asm/shmbuf.h b/arch/mips/include/uapi/asm/shmbuf.h index 680bb95b2240..eb74d304b779 100644 --- a/arch/mips/include/uapi/asm/shmbuf.h +++ b/arch/mips/include/uapi/asm/shmbuf.h @@ -2,6 +2,9 @@ #ifndef _ASM_SHMBUF_H #define _ASM_SHMBUF_H +#include +#include + /* * The shmid64_ds structure for the MIPS architecture. * Note extra padding because this structure is passed back and forth @@ -16,7 +19,7 @@ #ifdef __mips64 struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ + __kernel_size_t shm_segsz; /* size of segment (bytes) */ long shm_atime; /* last attach time */ long shm_dtime; /* last detach time */ long shm_ctime; /* last change time */ @@ -29,7 +32,7 @@ struct shmid64_ds { #else struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ + __kernel_size_t shm_segsz; /* size of segment (bytes) */ unsigned long shm_atime; /* last attach time */ unsigned long shm_dtime; /* last detach time */ unsigned long shm_ctime; /* last change time */ diff --git a/arch/parisc/include/uapi/asm/shmbuf.h b/arch/parisc/include/uapi/asm/shmbuf.h index 5da3089be65e..532da742fb56 100644 --- a/arch/parisc/include/uapi/asm/shmbuf.h +++ b/arch/parisc/include/uapi/asm/shmbuf.h @@ -3,6 +3,8 @@ #define _PARISC_SHMBUF_H #include +#include +#include /* * The shmid64_ds structure for parisc architecture. diff --git a/arch/powerpc/include/uapi/asm/shmbuf.h b/arch/powerpc/include/uapi/asm/shmbuf.h index 00422b2f3c63..439a3a02ba64 100644 --- a/arch/powerpc/include/uapi/asm/shmbuf.h +++ b/arch/powerpc/include/uapi/asm/shmbuf.h @@ -2,6 +2,9 @@ #ifndef _ASM_POWERPC_SHMBUF_H #define _ASM_POWERPC_SHMBUF_H +#include +#include + /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -34,7 +37,7 @@ struct shmid64_ds { unsigned long shm_ctime; /* last change time */ unsigned long __unused4; #endif - size_t shm_segsz; /* size of segment (bytes) */ + __kernel_size_t shm_segsz; /* size of segment (bytes) */ __kernel_pid_t shm_cpid; /* pid of creator */ __kernel_pid_t shm_lpid; /* pid of last operator */ unsigned long shm_nattch; /* no. of current attaches */ diff --git a/arch/sparc/include/uapi/asm/shmbuf.h b/arch/sparc/include/uapi/asm/shmbuf.h index a5d7d8d681c4..ed4f061c7a15 100644 --- a/arch/sparc/include/uapi/asm/shmbuf.h +++ b/arch/sparc/include/uapi/asm/shmbuf.h @@ -2,6 +2,9 @@ #ifndef _SPARC_SHMBUF_H #define _SPARC_SHMBUF_H +#include +#include + /* * The shmid64_ds structure for sparc architecture. * Note extra padding because this structure is passed back and forth @@ -25,7 +28,7 @@ struct shmid64_ds { unsigned long shm_ctime_high; unsigned long shm_ctime; /* last change time */ #endif - size_t shm_segsz; /* size of segment (bytes) */ + __kernel_size_t shm_segsz; /* size of segment (bytes) */ __kernel_pid_t shm_cpid; /* pid of creator */ __kernel_pid_t shm_lpid; /* pid of last operator */ unsigned long shm_nattch; /* no. of current attaches */ diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h index fce18eaa070c..13775bfdfee2 100644 --- a/arch/x86/include/uapi/asm/shmbuf.h +++ b/arch/x86/include/uapi/asm/shmbuf.h @@ -5,6 +5,10 @@ #if !defined(__x86_64__) || !defined(__ILP32__) #include #else + +#include +#include + /* * The shmid64_ds structure for x86 architecture with x32 ABI. * @@ -15,7 +19,7 @@ struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ + __kernel_size_t shm_segsz; /* size of segment (bytes) */ __kernel_long_t shm_atime; /* last attach time */ __kernel_long_t shm_dtime; /* last detach time */ __kernel_long_t shm_ctime; /* last change time */ diff --git a/arch/xtensa/include/uapi/asm/shmbuf.h b/arch/xtensa/include/uapi/asm/shmbuf.h index 554a57a6a90f..bb8bdddae9b5 100644 --- a/arch/xtensa/include/uapi/asm/shmbuf.h +++ b/arch/xtensa/include/uapi/asm/shmbuf.h @@ -20,9 +20,12 @@ #ifndef _XTENSA_SHMBUF_H #define _XTENSA_SHMBUF_H +#include +#include + struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ + __kernel_size_t shm_segsz; /* size of segment (bytes) */ unsigned long shm_atime; /* last attach time */ unsigned long shm_atime_high; unsigned long shm_dtime; /* last detach time */ diff --git a/include/uapi/asm-generic/shmbuf.h b/include/uapi/asm-generic/shmbuf.h index 2bab955e0fed..2979b6dd2c56 100644 --- a/include/uapi/asm-generic/shmbuf.h +++ b/include/uapi/asm-generic/shmbuf.h @@ -3,6 +3,8 @@ #define __ASM_GENERIC_SHMBUF_H #include +#include +#include /* * The shmid64_ds structure for x86 architecture. @@ -24,7 +26,7 @@ struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ + __kernel_size_t shm_segsz; /* size of segment (bytes) */ #if __BITS_PER_LONG == 64 long shm_atime; /* last attach time */ long shm_dtime; /* last detach time */ diff --git a/usr/include/Makefile b/usr/include/Makefile index 3150b86748e9..1aa725a3cbbc 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -20,7 +20,6 @@ override c_flags = $(UAPI_CFLAGS) -Wp,-MMD,$(depfile) -I$(objtree)/usr/include # Please consider to fix the header first. # # Sorted alphabetically. -no-header-test += asm/shmbuf.h no-header-test += asm/ucontext.h no-header-test += drm/vmwgfx_drm.h no-header-test += linux/am437x-vpfe.h -- cgit v1.2.3 From 406a8c1d8fa59ae6a6462a6fb6ff892f6a4f7499 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 22 Feb 2022 15:05:30 +0100 Subject: powerpc: Remove remaining stab codes Following commit 12318163737c ("powerpc/32: Remove remaining .stabs annotations"), stabs code are not used anymore. Remove them. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d8b33342d7454f6ca4f368f5206896558dfa06f4.1645538722.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 3c06a33b5da4..4dea2d963738 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -692,12 +692,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #define evr30 30 #define evr31 31 -/* some stab codes */ -#define N_FUN 36 -#define N_RSYM 64 -#define N_SLINE 68 -#define N_SO 100 - #define RFSCV .long 0x4c0000a4 /* -- cgit v1.2.3 From 8b91cee5eadd2021f55e6775f2d50bd56d00c217 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 4 Feb 2022 13:53:48 +1000 Subject: powerpc/64s/hash: Make hash faults work in NMI context Hash faults are not resoved in NMI context, instead causing the access to fail. This is done because perf interrupts can get backtraces including walking the user stack, and taking a hash fault on those could deadlock on the HPTE lock if the perf interrupt hits while the same HPTE lock is being held by the hash fault code. The user-access for the stack walking will notice the access failed and deal with that in the perf code. The reason to allow perf interrupts in is to better profile hash faults. The problem with this is any hash fault on a kernel access that happens in NMI context will crash, because kernel accesses must not fail. Hard lockups, system reset, machine checks that access vmalloc space including modules and including stack backtracing and symbol lookup in modules, per-cpu data, etc could all run into this problem. Fix this by disallowing perf interrupts in the hash fault code (the direct hash fault is covered by MSR[EE]=0 so the PMI disable just needs to extend to the preload case). This simplifies the tricky logic in hash faults and perf, at the cost of reduced profiling of hash faults. perf can still latch addresses when interrupts are disabled, it just won't get the stack trace at that point, so it would still find hot spots, just sometimes with confusing stack chains. An alternative could be to allow perf interrupts here but always do the slowpath stack walk if we are in nmi context, but that slows down all perf interrupt stack walking on hash though and it does not remove as much tricky code. Reported-by: Laurent Dufour Signed-off-by: Nicholas Piggin Tested-by: Laurent Dufour Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220204035348.545435-1-npiggin@gmail.com --- arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/mm/book3s64/hash_utils.c | 54 ++++++----------------------------- arch/powerpc/perf/callchain.h | 9 +----- arch/powerpc/perf/callchain_64.c | 27 ------------------ 4 files changed, 10 insertions(+), 82 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index fc28f46d2f9d..5404f7abbcf8 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -612,7 +612,7 @@ DECLARE_INTERRUPT_HANDLER_RAW(do_slb_fault); DECLARE_INTERRUPT_HANDLER(do_bad_segment_interrupt); /* hash_utils.c */ -DECLARE_INTERRUPT_HANDLER_RAW(do_hash_fault); +DECLARE_INTERRUPT_HANDLER(do_hash_fault); /* fault.c */ DECLARE_INTERRUPT_HANDLER(do_page_fault); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 7abf82a698d3..985cabdd7f67 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1621,8 +1621,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -DECLARE_INTERRUPT_HANDLER(__do_hash_fault); -DEFINE_INTERRUPT_HANDLER(__do_hash_fault) +DEFINE_INTERRUPT_HANDLER(do_hash_fault) { unsigned long ea = regs->dar; unsigned long dsisr = regs->dsisr; @@ -1681,35 +1680,6 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault) } } -/* - * The _RAW interrupt entry checks for the in_nmi() case before - * running the full handler. - */ -DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault) -{ - /* - * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then - * don't call hash_page, just fail the fault. This is required to - * prevent re-entrancy problems in the hash code, namely perf - * interrupts hitting while something holds H_PAGE_BUSY, and taking a - * hash fault. See the comment in hash_preload(). - * - * We come here as a result of a DSI at a point where we don't want - * to call hash_page, such as when we are accessing memory (possibly - * user memory) inside a PMU interrupt that occurred while interrupts - * were soft-disabled. We want to invoke the exception handler for - * the access, or panic if there isn't a handler. - */ - if (unlikely(in_nmi())) { - do_bad_page_fault_segv(regs); - return 0; - } - - __do_hash_fault(regs); - - return 0; -} - #ifdef CONFIG_PPC_MM_SLICES static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { @@ -1776,26 +1746,18 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* - * __hash_page_* must run with interrupts off, as it sets the - * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any - * time and may take a hash fault reading the user stack, see - * read_user_stack_slow() in the powerpc/perf code. - * - * If that takes a hash fault on the same page as we lock here, it - * will bail out when seeing H_PAGE_BUSY set, and retry the access - * leading to an infinite loop. + * __hash_page_* must run with interrupts off, including PMI interrupts + * off, as it sets the H_PAGE_BUSY bit. * - * Disabling interrupts here does not prevent perf interrupts, but it - * will prevent them taking hash faults (see the NMI test in - * do_hash_page), then read_user_stack's copy_from_user_nofault will - * fail and perf will fall back to read_user_stack_slow(), which - * walks the Linux page tables. + * It's otherwise possible for perf interrupts to hit at any time and + * may take a hash fault reading the user stack, which could take a + * hash miss and deadlock on the same H_PAGE_BUSY bit. * * Interrupts must also be off for the duration of the * mm_is_thread_local test and update, to prevent preempt running the * mm on another CPU (XXX: this may be racy vs kthread_use_mm). */ - local_irq_save(flags); + powerpc_local_irq_pmu_save(flags); /* Is that local to this CPU ? */ if (mm_is_thread_local(mm)) @@ -1820,7 +1782,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, mm_ctx_user_psize(&mm->context), pte_val(*ptep)); - local_irq_restore(flags); + powerpc_local_irq_pmu_restore(flags); } /* diff --git a/arch/powerpc/perf/callchain.h b/arch/powerpc/perf/callchain.h index d6fa6e25234f..19a8d051ddf1 100644 --- a/arch/powerpc/perf/callchain.h +++ b/arch/powerpc/perf/callchain.h @@ -2,7 +2,6 @@ #ifndef _POWERPC_PERF_CALLCHAIN_H #define _POWERPC_PERF_CALLCHAIN_H -int read_user_stack_slow(const void __user *ptr, void *buf, int nb); void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, @@ -26,17 +25,11 @@ static inline int __read_user_stack(const void __user *ptr, void *ret, size_t size) { unsigned long addr = (unsigned long)ptr; - int rc; if (addr > TASK_SIZE - size || (addr & (size - 1))) return -EFAULT; - rc = copy_from_user_nofault(ret, ptr, size); - - if (IS_ENABLED(CONFIG_PPC64) && !radix_enabled() && rc) - return read_user_stack_slow(ptr, ret, size); - - return rc; + return copy_from_user_nofault(ret, ptr, size); } #endif /* _POWERPC_PERF_CALLCHAIN_H */ diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c index 8d0df4226328..488e8a21a11e 100644 --- a/arch/powerpc/perf/callchain_64.c +++ b/arch/powerpc/perf/callchain_64.c @@ -18,33 +18,6 @@ #include "callchain.h" -/* - * On 64-bit we don't want to invoke hash_page on user addresses from - * interrupt context, so if the access faults, we read the page tables - * to find which page (if any) is mapped and access it directly. Radix - * has no need for this so it doesn't use read_user_stack_slow. - */ -int read_user_stack_slow(const void __user *ptr, void *buf, int nb) -{ - - unsigned long addr = (unsigned long) ptr; - unsigned long offset; - struct page *page; - void *kaddr; - - if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) { - kaddr = page_address(page); - - /* align address to page boundary */ - offset = addr & ~PAGE_MASK; - - memcpy(buf, kaddr + offset, nb); - put_page(page); - return 0; - } - return -EFAULT; -} - static int read_user_stack_64(const unsigned long __user *ptr, unsigned long *ret) { return __read_user_stack(ptr, ret, sizeof(*ret)); -- cgit v1.2.3 From 34737e26980341519d00e84711fe619f9f47e79c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 11 Feb 2022 08:50:00 +0100 Subject: uaccess: add generic __{get,put}_kernel_nofault Nine architectures are still missing __{get,put}_kernel_nofault: alpha, ia64, microblaze, nds32, nios2, openrisc, sh, sparc32, xtensa. Add a generic version that lets everything use the normal copy_{from,to}_kernel_nofault() code based on these, removing the last use of get_fs()/set_fs() from architecture-independent code. Reviewed-by: Christoph Hellwig Acked-by: Geert Uytterhoeven Signed-off-by: Arnd Bergmann --- arch/arm/include/asm/uaccess.h | 2 - arch/arm64/include/asm/uaccess.h | 2 - arch/m68k/include/asm/uaccess.h | 2 - arch/mips/include/asm/uaccess.h | 2 - arch/parisc/include/asm/uaccess.h | 1 - arch/powerpc/include/asm/uaccess.h | 2 - arch/riscv/include/asm/uaccess.h | 2 - arch/s390/include/asm/uaccess.h | 2 - arch/sparc/include/asm/uaccess_64.h | 2 - arch/um/include/asm/uaccess.h | 2 - arch/x86/include/asm/uaccess.h | 2 - include/asm-generic/uaccess.h | 2 - include/linux/uaccess.h | 19 +++++++ mm/maccess.c | 108 ------------------------------------ 14 files changed, 19 insertions(+), 131 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 32dbfd81f42a..d20d78c34b94 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -476,8 +476,6 @@ do { \ : "r" (x), "i" (-EFAULT) \ : "cc") -#define HAVE_GET_KERNEL_NOFAULT - #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ const type *__pk_ptr = (src); \ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 3a5ff5e20586..2e20879fe3cf 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -26,8 +26,6 @@ #include #include -#define HAVE_GET_KERNEL_NOFAULT - /* * Test whether a block of memory is a valid user space address. * Returns 1 if the range is valid, 0 otherwise. diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index ba670523885c..79617c0b2f91 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -390,8 +390,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -#define HAVE_GET_KERNEL_NOFAULT - #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ type *__gk_dst = (type *)(dst); \ diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index f8f74f9f5883..db9a8e002b62 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -296,8 +296,6 @@ struct __large_struct { unsigned long buf[100]; }; (val) = __gu_tmp.t; \ } -#define HAVE_GET_KERNEL_NOFAULT - #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ int __gu_err; \ diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ebf8a845b017..0925bbd6db67 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -95,7 +95,6 @@ struct exception_table_entry { (val) = (__force __typeof__(*(ptr))) __gu_val; \ } -#define HAVE_GET_KERNEL_NOFAULT #define __get_kernel_nofault(dst, src, type, err_label) \ { \ type __z; \ diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 63316100080c..a0032c2e7550 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -467,8 +467,6 @@ do { \ unsafe_put_user(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e); \ } while (0) -#define HAVE_GET_KERNEL_NOFAULT - #define __get_kernel_nofault(dst, src, type, err_label) \ __get_user_size_goto(*((type *)(dst)), \ (__force type __user *)(src), sizeof(type), err_label) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index c701a5e57a2b..4407b9e48d2c 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -346,8 +346,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) __clear_user(to, n) : n; } -#define HAVE_GET_KERNEL_NOFAULT - #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ long __kr_err; \ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index d74e26b48604..29332edf46f0 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -282,8 +282,6 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo int copy_to_user_real(void __user *dest, void *src, unsigned long count); void *s390_kernel_write(void *dst, const void *src, size_t size); -#define HAVE_GET_KERNEL_NOFAULT - int __noreturn __put_kernel_bad(void); #define __put_kernel_asm(val, to, insn) \ diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index b283798315b1..5c12fb46bc61 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -210,8 +210,6 @@ __asm__ __volatile__( \ : "=r" (ret), "=r" (x) : "r" (__m(addr)), \ "i" (-EFAULT)) -#define HAVE_GET_KERNEL_NOFAULT - #define __get_user_nocheck(data, addr, size, type) ({ \ register int __gu_ret; \ register unsigned long __gu_val; \ diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 17d18cfd82a5..1ecfc96bcc50 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -44,8 +44,6 @@ static inline int __access_ok(unsigned long addr, unsigned long size) } /* no pagefaults for kernel addresses in um */ -#define HAVE_GET_KERNEL_NOFAULT 1 - #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ *((type *)dst) = get_unaligned((type *)(src)); \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a59ba2578e64..201efcec66b7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -507,8 +507,6 @@ do { \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ } while (0) -#define HAVE_GET_KERNEL_NOFAULT - #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 10ffa8b5c117..0870fa11a7c5 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -77,8 +77,6 @@ do { \ goto err_label; \ } while (0) -#define HAVE_GET_KERNEL_NOFAULT 1 - static inline __must_check unsigned long raw_copy_from_user(void *to, const void __user * from, unsigned long n) { diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ac0394087f7d..67e9bc94dc40 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -368,6 +368,25 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); +#ifndef __get_kernel_nofault +#define __get_kernel_nofault(dst, src, type, label) \ +do { \ + type __user *p = (type __force __user *)(src); \ + type data; \ + if (__get_user(data, p)) \ + goto label; \ + *(type *)dst = data; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, label) \ +do { \ + type __user *p = (type __force __user *)(dst); \ + type data = *(type *)src; \ + if (__put_user(data, p)) \ + goto label; \ +} while (0) +#endif + /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable diff --git a/mm/maccess.c b/mm/maccess.c index d3f1a1f0b1c1..cbd1b3959af2 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -12,8 +12,6 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, return true; } -#ifdef HAVE_GET_KERNEL_NOFAULT - #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ @@ -102,112 +100,6 @@ Efault: dst[-1] = '\0'; return -EFAULT; } -#else /* HAVE_GET_KERNEL_NOFAULT */ -/** - * copy_from_kernel_nofault(): safely attempt to read from kernel-space - * @dst: pointer to the buffer that shall take the data - * @src: address to read from - * @size: size of the data chunk - * - * Safely read from kernel address @src to the buffer at @dst. If a kernel - * fault happens, handle that and return -EFAULT. If @src is not a valid kernel - * address, return -ERANGE. - * - * We ensure that the copy_from_user is executed in atomic context so that - * do_page_fault() doesn't attempt to take mmap_lock. This makes - * copy_from_kernel_nofault() suitable for use within regions where the caller - * already holds mmap_lock, or other locks which nest inside mmap_lock. - */ -long copy_from_kernel_nofault(void *dst, const void *src, size_t size) -{ - long ret; - mm_segment_t old_fs = get_fs(); - - if (!copy_from_kernel_nofault_allowed(src, size)) - return -ERANGE; - - set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, - size); - pagefault_enable(); - set_fs(old_fs); - - if (ret) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); - -/** - * copy_to_kernel_nofault(): safely attempt to write to a location - * @dst: address to write to - * @src: pointer to the data that shall be written - * @size: size of the data chunk - * - * Safely write to address @dst from the buffer at @src. If a kernel fault - * happens, handle that and return -EFAULT. - */ -long copy_to_kernel_nofault(void *dst, const void *src, size_t size) -{ - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); - pagefault_enable(); - set_fs(old_fs); - - if (ret) - return -EFAULT; - return 0; -} - -/** - * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe - * address. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @unsafe_addr: Unsafe address. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from unsafe address to kernel buffer. - * - * On success, returns the length of the string INCLUDING the trailing NUL. - * - * If access fails, returns -EFAULT (some data may have been copied and the - * trailing NUL added). If @unsafe_addr is not a valid kernel address, return - * -ERANGE. - * - * If @count is smaller than the length of the string, copies @count-1 bytes, - * sets the last byte of @dst buffer to NUL and returns @count. - */ -long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) -{ - mm_segment_t old_fs = get_fs(); - const void *src = unsafe_addr; - long ret; - - if (unlikely(count <= 0)) - return 0; - if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) - return -ERANGE; - - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __get_user(*dst++, (const char __user __force *)src++); - } while (dst[-1] && ret == 0 && src - unsafe_addr < count); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); - - return ret ? -EFAULT : src - unsafe_addr; -} -#endif /* HAVE_GET_KERNEL_NOFAULT */ /** * copy_from_user_nofault(): safely attempt to read from a user-space location -- cgit v1.2.3 From 12700c17fc286149324f92d6d380bc48e43f253d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Feb 2022 17:55:04 +0100 Subject: uaccess: generalize access_ok() There are many different ways that access_ok() is defined across architectures, but in the end, they all just compare against the user_addr_max() value or they accept anything. Provide one definition that works for most architectures, checking against TASK_SIZE_MAX for user processes or skipping the check inside of uaccess_kernel() sections. For architectures without CONFIG_SET_FS(), this should be the fastest check, as it comes down to a single comparison of a pointer against a compile-time constant, while the architecture specific versions tend to do something more complex for historic reasons or get something wrong. Type checking for __user annotations is handled inconsistently across architectures, but this is easily simplified as well by using an inline function that takes a 'const void __user *' argument. A handful of callers need an extra __user annotation for this. Some architectures had trick to use 33-bit or 65-bit arithmetic on the addresses to calculate the overflow, however this simpler version uses fewer registers, which means it can produce better object code in the end despite needing a second (statically predicted) branch. Reviewed-by: Christoph Hellwig Acked-by: Mark Rutland [arm64, asm-generic] Acked-by: Geert Uytterhoeven Acked-by: Stafford Horne Acked-by: Dinh Nguyen Signed-off-by: Arnd Bergmann --- arch/Kconfig | 7 ++++ arch/alpha/include/asm/uaccess.h | 34 +++----------------- arch/arc/include/asm/uaccess.h | 29 ----------------- arch/arm/include/asm/uaccess.h | 20 +----------- arch/arm64/include/asm/uaccess.h | 11 +++---- arch/csky/include/asm/uaccess.h | 8 ----- arch/hexagon/include/asm/uaccess.h | 25 --------------- arch/ia64/include/asm/uaccess.h | 5 ++- arch/m68k/Kconfig.cpu | 1 + arch/m68k/include/asm/uaccess.h | 15 +-------- arch/microblaze/include/asm/uaccess.h | 8 +---- arch/mips/include/asm/uaccess.h | 29 +---------------- arch/nds32/include/asm/uaccess.h | 7 ++-- arch/nios2/include/asm/uaccess.h | 11 +------ arch/openrisc/include/asm/uaccess.h | 19 +---------- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/uaccess.h | 12 ++----- arch/powerpc/include/asm/uaccess.h | 11 +------ arch/riscv/include/asm/uaccess.h | 31 +----------------- arch/s390/Kconfig | 1 + arch/s390/include/asm/uaccess.h | 14 +------- arch/sh/include/asm/uaccess.h | 22 ++----------- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/uaccess.h | 3 -- arch/sparc/include/asm/uaccess_32.h | 18 +++-------- arch/sparc/include/asm/uaccess_64.h | 12 +------ arch/um/include/asm/uaccess.h | 5 +-- arch/x86/include/asm/uaccess.h | 14 ++------ arch/xtensa/include/asm/uaccess.h | 10 +----- include/asm-generic/access_ok.h | 60 +++++++++++++++++++++++++++++++++++ include/asm-generic/uaccess.h | 21 +----------- include/linux/uaccess.h | 7 ---- 32 files changed, 110 insertions(+), 362 deletions(-) create mode 100644 include/asm-generic/access_ok.h (limited to 'arch/powerpc/include') diff --git a/arch/Kconfig b/arch/Kconfig index 678a80713b21..fa5db36bda67 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -898,6 +898,13 @@ config HAVE_SOFTIRQ_ON_OWN_STACK Architecture provides a function to run __do_softirq() on a separate stack. +config ALTERNATE_USER_ADDRESS_SPACE + bool + help + Architectures set this when the CPU uses separate address + spaces for kernel and user space pointers. In this case, the + access_ok() check on a __user pointer is skipped. + config PGTABLE_LEVELS int default 2 diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 1b6f25efa247..82c5743fc9cd 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -20,28 +20,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -/* - * Is a address valid? This does a straightforward calculation rather - * than tests. - * - * Address valid if: - * - "addr" doesn't have any high-bits set - * - AND "size" doesn't have any high-bits set - * - AND "addr+size-(size != 0)" doesn't have any high-bits set - * - OR we are in kernel mode. - */ -#define __access_ok(addr, size) ({ \ - unsigned long __ao_a = (addr), __ao_b = (size); \ - unsigned long __ao_end = __ao_a + __ao_b - !!__ao_b; \ - (get_fs().seg & (__ao_a | __ao_b | __ao_end)) == 0; }) - -#define access_ok(addr, size) \ -({ \ - __chk_user_ptr(addr); \ - __access_ok(((unsigned long)(addr)), (size)); \ -}) +#include /* * These are the main single-value transfer routines. They automatically @@ -105,7 +84,7 @@ extern void __get_user_unknown(void); long __gu_err = -EFAULT; \ unsigned long __gu_val = 0; \ const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - if (__access_ok((unsigned long)__gu_addr, size)) { \ + if (__access_ok(__gu_addr, size)) { \ __gu_err = 0; \ switch (size) { \ case 1: __get_user_8(__gu_addr); break; \ @@ -200,7 +179,7 @@ extern void __put_user_unknown(void); ({ \ long __pu_err = -EFAULT; \ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - if (__access_ok((unsigned long)__pu_addr, size)) { \ + if (__access_ok(__pu_addr, size)) { \ __pu_err = 0; \ switch (size) { \ case 1: __put_user_8(x, __pu_addr); break; \ @@ -316,17 +295,14 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long len) extern long __clear_user(void __user *to, long len); -extern inline long +static inline long clear_user(void __user *to, long len) { - if (__access_ok((unsigned long)to, len)) + if (__access_ok(to, len)) len = __clear_user(to, len); return len; } -#define user_addr_max() \ - (uaccess_kernel() ? ~0UL : TASK_SIZE) - extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index 783bfdb3bfa3..30f80b4be2ab 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -23,35 +23,6 @@ #include /* for generic string functions */ - -#define __kernel_ok (uaccess_kernel()) - -/* - * Algorithmically, for __user_ok() we want do: - * (start < TASK_SIZE) && (start+len < TASK_SIZE) - * where TASK_SIZE could either be retrieved from thread_info->addr_limit or - * emitted directly in code. - * - * This can however be rewritten as follows: - * (len <= TASK_SIZE) && (start+len < TASK_SIZE) - * - * Because it essentially checks if buffer end is within limit and @len is - * non-ngeative, which implies that buffer start will be within limit too. - * - * The reason for rewriting being, for majority of cases, @len is generally - * compile time constant, causing first sub-expression to be compile time - * subsumed. - * - * The second part would generate weird large LIMMs e.g. (0x6000_0000 - 0x10), - * so we check for TASK_SIZE using get_fs() since the addr_limit load from mem - * would already have been done at this call site for __kernel_ok() - * - */ -#define __user_ok(addr, sz) (((sz) <= TASK_SIZE) && \ - ((addr) <= (get_fs() - (sz)))) -#define __access_ok(addr, sz) (unlikely(__kernel_ok) || \ - likely(__user_ok((addr), (sz)))) - /*********** Single byte/hword/word copies ******************/ #define __get_user_fn(sz, u, k) \ diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index d20d78c34b94..2fcbec9c306c 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -55,21 +55,6 @@ extern int __put_user_bad(void); #ifdef CONFIG_MMU -/* - * We use 33-bit arithmetic here. Success returns zero, failure returns - * addr_limit. We take advantage that addr_limit will be zero for KERNEL_DS, - * so this will always return success in that case. - */ -#define __range_ok(addr, size) ({ \ - unsigned long flag, roksum; \ - __chk_user_ptr(addr); \ - __asm__(".syntax unified\n" \ - "adds %1, %2, %3; sbcscc %1, %1, %0; movcc %0, #0" \ - : "=&r" (flag), "=&r" (roksum) \ - : "r" (addr), "Ir" (size), "0" (TASK_SIZE) \ - : "cc"); \ - flag; }) - /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. @@ -241,15 +226,12 @@ extern int __put_user_8(void *, unsigned long long); #else /* CONFIG_MMU */ -#define __addr_ok(addr) ((void)(addr), 1) -#define __range_ok(addr, size) ((void)(addr), 0) - #define get_user(x, p) __get_user(x, p) #define __put_user_check __put_user_nocheck #endif /* CONFIG_MMU */ -#define access_ok(addr, size) (__range_ok(addr, size) == 0) +#include #ifdef CONFIG_CPU_SPECTRE /* diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 199c553b740a..e8dce0cc5eaa 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -26,13 +26,7 @@ #include #include -static inline int __access_ok(const void __user *ptr, unsigned long size) -{ - unsigned long limit = TASK_SIZE_MAX; - unsigned long addr = (unsigned long)ptr; - - return (size <= limit) && (addr <= (limit - size)); -} +static inline int __access_ok(const void __user *ptr, unsigned long size); /* * Test whether a block of memory is a valid user space address. @@ -54,6 +48,9 @@ static inline int access_ok(const void __user *addr, unsigned long size) return likely(__access_ok(addr, size)); } +#define access_ok access_ok + +#include /* * User access enabling/disabling. diff --git a/arch/csky/include/asm/uaccess.h b/arch/csky/include/asm/uaccess.h index ac5a54f57d40..fec8f77ffc99 100644 --- a/arch/csky/include/asm/uaccess.h +++ b/arch/csky/include/asm/uaccess.h @@ -5,14 +5,6 @@ #define user_addr_max() (current_thread_info()->addr_limit.seg) -static inline int __access_ok(unsigned long addr, unsigned long size) -{ - unsigned long limit = user_addr_max(); - - return (size <= limit) && (addr <= (limit - size)); -} -#define __access_ok __access_ok - /* * __put_user_fn */ diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h index 719ba3f3c45c..bff77efc0d9a 100644 --- a/arch/hexagon/include/asm/uaccess.h +++ b/arch/hexagon/include/asm/uaccess.h @@ -12,31 +12,6 @@ */ #include -/* - * access_ok: - Checks if a user space pointer is valid - * @addr: User space pointer to start of block to check - * @size: Size of block to check - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Checks if a pointer to a block of memory in user space is valid. - * - * Returns true (nonzero) if the memory block *may* be valid, false (zero) - * if it is definitely invalid. - * - */ -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE) - -static inline int __access_ok(unsigned long addr, unsigned long size) -{ - unsigned long limit = TASK_SIZE; - - return (size <= limit) && (addr <= (limit - size)); -} -#define __access_ok __access_ok - /* * When a kernel-mode page fault is taken, the faulting instruction * address is checked against a table of exception_table_entries. diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index e19d2dcc0ced..e242a3cc1330 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -50,8 +50,6 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - /* * When accessing user memory, we need to make sure the entire area really is in * user-level space. In order to do this efficiently, we make sure that the page at @@ -65,7 +63,8 @@ static inline int __access_ok(const void __user *p, unsigned long size) return likely(addr <= seg) && (seg == KERNEL_DS.seg || likely(REGION_OFFSET(addr) < RGN_MAP_LIMIT)); } -#define access_ok(addr, size) __access_ok((addr), (size)) +#define __access_ok __access_ok +#include /* * These are the main single-value transfer routines. They automatically diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 0d00ef5117dc..16ea9a67723c 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -453,6 +453,7 @@ config CPU_HAS_NO_UNALIGNED config CPU_HAS_ADDRESS_SPACES bool + select ALTERNATE_USER_ADDRESS_SPACE config FPU bool diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index 9f0f1b6e14ed..64914872a5c9 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -10,20 +10,7 @@ #include #include #include - -/* We let the MMU do all checking */ -static inline int access_ok(const void __user *ptr, - unsigned long size) -{ - unsigned long limit = TASK_SIZE; - unsigned long addr = (unsigned long)ptr; - - if (IS_ENABLED(CONFIG_CPU_HAS_ADDRESS_SPACES) || - !IS_ENABLED(CONFIG_MMU)) - return 1; - - return (size <= limit) && (addr <= (limit - size)); -} +#include /* * Not all varients of the 68k family support the notion of address spaces. diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 3fe96979d2c6..bf9b7657a65a 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -39,13 +39,7 @@ # define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -static inline int __access_ok(unsigned long addr, unsigned long size) -{ - unsigned long limit = user_addr_max(); - - return (size <= limit) && (addr <= (limit - size)); -} -#define access_ok(addr, size) __access_ok((unsigned long)addr, size) +#include # define __FIXUP_SECTION ".section .fixup,\"ax\"\n" # define __EX_TABLE_SECTION ".section __ex_table,\"a\"\n" diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 73e543bc2e0e..c0cede273c7c 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -44,34 +44,7 @@ extern u64 __ua_limit; #endif /* CONFIG_64BIT */ -/* - * access_ok: - Checks if a user space pointer is valid - * @addr: User space pointer to start of block to check - * @size: Size of block to check - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Checks if a pointer to a block of memory in user space is valid. - * - * Returns true (nonzero) if the memory block may be valid, false (zero) - * if it is definitely invalid. - * - * Note that, depending on architecture, this function probably just - * checks that the pointer is in the user space range - after calling - * this function, memory access functions may still return -EFAULT. - */ - -static inline int __access_ok(const void __user *p, unsigned long size) -{ - unsigned long addr = (unsigned long)p; - unsigned long limit = TASK_SIZE_MAX; - - return (size <= limit) && (addr <= (limit - size)); -} - -#define access_ok(addr, size) \ - likely(__access_ok((addr), (size))) +#include /* * put_user: - Write a simple value into user space. diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h index 37a40981deb3..832d642a4068 100644 --- a/arch/nds32/include/asm/uaccess.h +++ b/arch/nds32/include/asm/uaccess.h @@ -38,18 +38,15 @@ extern int fixup_exception(struct pt_regs *regs); #define get_fs() (current_thread_info()->addr_limit) #define user_addr_max get_fs +#define uaccess_kernel() (get_fs() == KERNEL_DS) static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; } -#define uaccess_kernel() (get_fs() == KERNEL_DS) +#include -#define __range_ok(addr, size) (size <= get_fs() && addr <= (get_fs() -size)) - -#define access_ok(addr, size) \ - __range_ok((unsigned long)addr, (unsigned long)size) /* * Single-value transfer routines. They automatically use the right * size if we just have the right pointer type. Note that the functions diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index a5cbe07cf0da..6664ddc0e8e5 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -30,19 +30,10 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(seg) (current_thread_info()->addr_limit = (seg)) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -#define __access_ok(addr, len) \ - (((signed long)(((long)get_fs().seg) & \ - ((long)(addr) | (((long)(addr)) + (len)) | (len)))) == 0) - -#define access_ok(addr, len) \ - likely(__access_ok((unsigned long)(addr), (unsigned long)(len))) +#include # define __EX_TABLE_SECTION ".section __ex_table,\"a\"\n" -#define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE) - /* * Zero Userspace */ diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 120f5005461b..8f049ec99b3e 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -45,21 +45,7 @@ #define uaccess_kernel() (get_fs() == KERNEL_DS) -/* Ensure that the range from addr to addr+size is all within the process' - * address space - */ -static inline int __range_ok(unsigned long addr, unsigned long size) -{ - const mm_segment_t fs = get_fs(); - - return size <= fs && addr <= (fs - size); -} - -#define access_ok(addr, size) \ -({ \ - __chk_user_ptr(addr); \ - __range_ok((unsigned long)(addr), (size)); \ -}) +#include /* * These are the main single-value transfer routines. They automatically @@ -268,9 +254,6 @@ clear_user(void __user *addr, unsigned long size) return size; } -#define user_addr_max() \ - (uaccess_kernel() ? ~0UL : TASK_SIZE) - extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 43c1c880def6..15039fdd5413 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 config PARISC def_bool y + select ALTERNATE_USER_ADDRESS_SPACE select ARCH_32BIT_OFF_T if !64BIT select ARCH_MIGHT_HAVE_PC_PARPORT select HAVE_FUNCTION_TRACER diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 0925bbd6db67..187f4bdff13e 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -11,15 +11,9 @@ #include #include -/* - * Note that since kernel addresses are in a separate address space on - * parisc, we don't need to do anything for access_ok(). - * We just let the page fault handler do the right thing. This also means - * that put_user is the same as __put_user, etc. - */ - -#define access_ok(uaddr, size) \ - ( (uaddr) == (uaddr) ) +#define TASK_SIZE_MAX DEFAULT_TASK_SIZE +#include +#include #define put_user __put_user #define get_user __get_user diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a0032c2e7550..2e83217f52de 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -11,18 +11,9 @@ #ifdef __powerpc64__ /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */ #define TASK_SIZE_MAX TASK_SIZE_USER64 -#else -#define TASK_SIZE_MAX TASK_SIZE #endif -static inline bool __access_ok(unsigned long addr, unsigned long size) -{ - return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr; -} - -#define access_ok(addr, size) \ - (__chk_user_ptr(addr), \ - __access_ok((unsigned long)(addr), (size))) +#include /* * These are the main single-value transfer routines. They automatically diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 4407b9e48d2c..855450bed9f5 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -21,42 +21,13 @@ #include #include #include +#include #define __enable_user_access() \ __asm__ __volatile__ ("csrs sstatus, %0" : : "r" (SR_SUM) : "memory") #define __disable_user_access() \ __asm__ __volatile__ ("csrc sstatus, %0" : : "r" (SR_SUM) : "memory") -/** - * access_ok: - Checks if a user space pointer is valid - * @addr: User space pointer to start of block to check - * @size: Size of block to check - * - * Context: User context only. This function may sleep. - * - * Checks if a pointer to a block of memory in user space is valid. - * - * Returns true (nonzero) if the memory block may be valid, false (zero) - * if it is definitely invalid. - * - * Note that, depending on architecture, this function probably just - * checks that the pointer is in the user space range - after calling - * this function, memory access functions may still return -EFAULT. - */ -#define access_ok(addr, size) ({ \ - __chk_user_ptr(addr); \ - likely(__access_ok((unsigned long __force)(addr), (size))); \ -}) - -/* - * Ensure that the range [addr, addr+size) is within the process's - * address space - */ -static inline int __access_ok(unsigned long addr, unsigned long size) -{ - return size <= TASK_SIZE && addr <= TASK_SIZE - size; -} - /* * The exception table consists of pairs of addresses: the first is the * address of an instruction that is allowed to fault, and the second is diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index be9f39fd06df..fb48a62aa985 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -55,6 +55,7 @@ config S390 # Note: keep this list sorted alphabetically # imply IMA_SECURE_AND_OR_TRUSTED_BOOT + select ALTERNATE_USER_ADDRESS_SPACE select ARCH_32BIT_USTAT_F_TINODE select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 29332edf46f0..5cb258cd9d29 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -17,22 +17,10 @@ #include #include #include +#include void debug_user_asce(int exit); -static inline int __range_ok(unsigned long addr, unsigned long size) -{ - return 1; -} - -#define __access_ok(addr, size) \ -({ \ - __chk_user_ptr(addr); \ - __range_ok((unsigned long)(addr), (size)); \ -}) - -#define access_ok(addr, size) __access_ok(addr, size) - unsigned long __must_check raw_copy_from_user(void *to, const void __user *from, unsigned long n); diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h index 8867bb04b00e..ccd219d74851 100644 --- a/arch/sh/include/asm/uaccess.h +++ b/arch/sh/include/asm/uaccess.h @@ -5,28 +5,10 @@ #include #include -#define __addr_ok(addr) \ - ((unsigned long __force)(addr) < current_thread_info()->addr_limit.seg) - -/* - * __access_ok: Check if address with size is OK or not. - * - * Uhhuh, this needs 33-bit arithmetic. We have a carry.. - * - * sum := addr + size; carry? --> flag = true; - * if (sum >= addr_limit) flag = true; - */ -#define __access_ok(addr, size) ({ \ - unsigned long __ao_a = (addr), __ao_b = (size); \ - unsigned long __ao_end = __ao_a + __ao_b - !!__ao_b; \ - __ao_end >= __ao_a && __addr_ok(__ao_end); }) - -#define access_ok(addr, size) \ - (__chk_user_ptr(addr), \ - __access_ok((unsigned long __force)(addr), (size))) - #define user_addr_max() (current_thread_info()->addr_limit.seg) +#include + /* * Uh, these should become the main single-value transfer routines ... * They automatically use the right size if we just have the right diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 1cab1b284f1a..9f6f9bce5292 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -62,6 +62,7 @@ config SPARC32 config SPARC64 def_bool 64BIT + select ALTERNATE_USER_ADDRESS_SPACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_KRETPROBES diff --git a/arch/sparc/include/asm/uaccess.h b/arch/sparc/include/asm/uaccess.h index 390094200fc4..ee75f69e3fcd 100644 --- a/arch/sparc/include/asm/uaccess.h +++ b/arch/sparc/include/asm/uaccess.h @@ -10,9 +10,6 @@ #include #endif -#define user_addr_max() \ - (uaccess_kernel() ? ~0UL : TASK_SIZE) - long strncpy_from_user(char *dest, const char __user *src, long count); #endif diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index 4a12346bb69c..367747116260 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -25,17 +25,7 @@ #define get_fs() (current->thread.current_ds) #define set_fs(val) ((current->thread.current_ds) = (val)) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -/* We have there a nice not-mapped page at PAGE_OFFSET - PAGE_SIZE, so that this test - * can be fairly lightweight. - * No one can read/write anything from userland in the kernel space by setting - * large size and address near to PAGE_OFFSET - a fault will break his intentions. - */ -#define __user_ok(addr, size) ({ (void)(size); (addr) < STACK_TOP; }) -#define __kernel_ok (uaccess_kernel()) -#define __access_ok(addr, size) (__user_ok((addr) & get_fs().seg, (size))) -#define access_ok(addr, size) __access_ok((unsigned long)(addr), size) +#include /* Uh, these should become the main single-value transfer routines.. * They automatically use the right size if we just have the right @@ -47,13 +37,13 @@ * and hide all the ugliness from the user. */ #define put_user(x, ptr) ({ \ - unsigned long __pu_addr = (unsigned long)(ptr); \ + void __user *__pu_addr = (ptr); \ __chk_user_ptr(ptr); \ __put_user_check((__typeof__(*(ptr)))(x), __pu_addr, sizeof(*(ptr))); \ }) #define get_user(x, ptr) ({ \ - unsigned long __gu_addr = (unsigned long)(ptr); \ + const void __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ __get_user_check((x), __gu_addr, sizeof(*(ptr)), __typeof__(*(ptr))); \ }) @@ -232,7 +222,7 @@ static inline unsigned long __clear_user(void __user *addr, unsigned long size) static inline unsigned long clear_user(void __user *addr, unsigned long n) { - if (n && __access_ok((unsigned long) addr, n)) + if (n && __access_ok(addr, n)) return __clear_user(addr, n); else return n; diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 5c12fb46bc61..59b9a545df23 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -31,7 +31,7 @@ #define get_fs() ((mm_segment_t){(current_thread_info()->current_ds)}) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) +#include #define set_fs(val) \ do { \ @@ -61,16 +61,6 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) -static inline int __access_ok(const void __user * addr, unsigned long size) -{ - return 1; -} - -static inline int access_ok(const void __user * addr, unsigned long size) -{ - return 1; -} - void __retl_efault(void); /* Uh, these should become the main single-value transfer routines.. diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 1ecfc96bcc50..7d9d60e41e4e 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -25,7 +25,7 @@ extern unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n); extern unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __clear_user(void __user *mem, unsigned long len); -static inline int __access_ok(unsigned long addr, unsigned long size); +static inline int __access_ok(const void __user *ptr, unsigned long size); /* Teach asm-generic/uaccess.h that we have C functions for these. */ #define __access_ok __access_ok @@ -36,8 +36,9 @@ static inline int __access_ok(unsigned long addr, unsigned long size); #include -static inline int __access_ok(unsigned long addr, unsigned long size) +static inline int __access_ok(const void __user *ptr, unsigned long size) { + unsigned long addr = (unsigned long)ptr; return __addr_range_nowrap(addr, size) && (__under_task_size(addr, size) || __access_ok_vsyscall(addr, size)); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 201efcec66b7..f78e2b3501a1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -12,18 +12,6 @@ #include #include -/* - * Test whether a block of memory is a valid user space address. - * Returns 0 if the range is valid, nonzero otherwise. - */ -static inline bool __access_ok(void __user *ptr, unsigned long size) -{ - unsigned long limit = TASK_SIZE_MAX; - unsigned long addr = ptr; - - return (size <= limit) && (addr <= (limit - size)); -} - #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline bool pagefault_disabled(void); # define WARN_ON_IN_IRQ() \ @@ -55,6 +43,8 @@ static inline bool pagefault_disabled(void); likely(__access_ok(addr, size)); \ }) +#include + extern int __get_user_1(void); extern int __get_user_2(void); extern int __get_user_4(void); diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index 75bd8fbf52ba..0edd9e4b23d0 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -35,15 +35,7 @@ #define get_fs() (current->thread.current_ds) #define set_fs(val) (current->thread.current_ds = (val)) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -#define __kernel_ok (uaccess_kernel()) -#define __user_ok(addr, size) \ - (((size) <= TASK_SIZE)&&((addr) <= TASK_SIZE-(size))) -#define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) -#define access_ok(addr, size) __access_ok((unsigned long)(addr), (size)) - -#define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE) +#include /* * These are the main single-value transfer routines. They diff --git a/include/asm-generic/access_ok.h b/include/asm-generic/access_ok.h new file mode 100644 index 000000000000..d38cc5dad65b --- /dev/null +++ b/include/asm-generic/access_ok.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_GENERIC_ACCESS_OK_H__ +#define __ASM_GENERIC_ACCESS_OK_H__ + +/* + * Checking whether a pointer is valid for user space access. + * These definitions work on most architectures, but overrides can + * be used where necessary. + */ + +/* + * architectures with compat tasks have a variable TASK_SIZE and should + * override this to a constant. + */ +#ifndef TASK_SIZE_MAX +#define TASK_SIZE_MAX TASK_SIZE +#endif + +#ifndef uaccess_kernel +#ifdef CONFIG_SET_FS +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) +#else +#define uaccess_kernel() (0) +#endif +#endif + +#ifndef user_addr_max +#define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE_MAX) +#endif + +#ifndef __access_ok +/* + * 'size' is a compile-time constant for most callers, so optimize for + * this case to turn the check into a single comparison against a constant + * limit and catch all possible overflows. + * On architectures with separate user address space (m68k, s390, parisc, + * sparc64) or those without an MMU, this should always return true. + * + * This version was originally contributed by Jonas Bonn for the + * OpenRISC architecture, and was found to be the most efficient + * for constant 'size' and 'limit' values. + */ +static inline int __access_ok(const void __user *ptr, unsigned long size) +{ + unsigned long limit = user_addr_max(); + unsigned long addr = (unsigned long)ptr; + + if (IS_ENABLED(CONFIG_ALTERNATE_USER_ADDRESS_SPACE) || + !IS_ENABLED(CONFIG_MMU)) + return true; + + return (size <= limit) && (addr <= (limit - size)); +} +#endif + +#ifndef access_ok +#define access_ok(addr, size) likely(__access_ok(addr, size)) +#endif + +#endif diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 0870fa11a7c5..ebc685dc8d74 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -114,28 +114,9 @@ static inline void set_fs(mm_segment_t fs) } #endif -#ifndef uaccess_kernel -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#endif - -#ifndef user_addr_max -#define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE) -#endif - #endif /* CONFIG_SET_FS */ -#define access_ok(addr, size) __access_ok((unsigned long)(addr),(size)) - -/* - * The architecture should really override this if possible, at least - * doing a check on the get_fs() - */ -#ifndef __access_ok -static inline int __access_ok(unsigned long addr, unsigned long size) -{ - return 1; -} -#endif +#include /* * These are the main single-value transfer routines. They automatically diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 67e9bc94dc40..2c31667e62e0 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -33,13 +33,6 @@ typedef struct { /* empty dummy */ } mm_segment_t; -#ifndef TASK_SIZE_MAX -#define TASK_SIZE_MAX TASK_SIZE -#endif - -#define uaccess_kernel() (false) -#define user_addr_max() (TASK_SIZE_MAX) - static inline mm_segment_t force_uaccess_begin(void) { return (mm_segment_t) { }; -- cgit v1.2.3 From 973e2e6462405d85d3e8bb02d516d5fe6d1193ed Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 25 Feb 2022 17:36:22 +0100 Subject: powerpc/interrupt: Remove struct interrupt_state Since commit ceff77efa4f8 ("powerpc/64e/interrupt: Use new interrupt context tracking scheme") struct interrupt_state has been empty and unused. Remove it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1d862ce3eab3da6ca7ac47d4a78a18f154462511.1645806970.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/interrupt.h | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 5404f7abbcf8..f3b2c93a5db1 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -123,9 +123,6 @@ static inline void nap_adjust_return(struct pt_regs *regs) #endif } -struct interrupt_state { -}; - static inline void booke_restore_dbcr0(void) { #ifdef CONFIG_PPC_ADV_DEBUG_REGS @@ -138,7 +135,7 @@ static inline void booke_restore_dbcr0(void) #endif } -static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrupt_state *state) +static inline void interrupt_enter_prepare(struct pt_regs *regs) { #ifdef CONFIG_PPC32 if (!arch_irq_disabled_regs(regs)) @@ -228,17 +225,17 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup * However interrupt_nmi_exit_prepare does return directly to regs, because * NMIs do not do "exit work" or replay soft-masked interrupts. */ -static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt_state *state) +static inline void interrupt_exit_prepare(struct pt_regs *regs) { } -static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct interrupt_state *state) +static inline void interrupt_async_enter_prepare(struct pt_regs *regs) { #ifdef CONFIG_PPC64 /* Ensure interrupt_enter_prepare does not enable MSR[EE] */ local_paca->irq_happened |= PACA_IRQ_HARD_DIS; #endif - interrupt_enter_prepare(regs, state); + interrupt_enter_prepare(regs); #ifdef CONFIG_PPC_BOOK3S_64 /* * RI=1 is set by interrupt_enter_prepare, so this thread flags access @@ -251,7 +248,7 @@ static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct in irq_enter(); } -static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct interrupt_state *state) +static inline void interrupt_async_exit_prepare(struct pt_regs *regs) { /* * Adjust at exit so the main handler sees the true NIA. This must @@ -262,7 +259,7 @@ static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct int nap_adjust_return(regs); irq_exit(); - interrupt_exit_prepare(regs, state); + interrupt_exit_prepare(regs); } struct interrupt_nmi_state { @@ -447,13 +444,11 @@ static __always_inline void ____##func(struct pt_regs *regs); \ \ interrupt_handler void func(struct pt_regs *regs) \ { \ - struct interrupt_state state; \ - \ - interrupt_enter_prepare(regs, &state); \ + interrupt_enter_prepare(regs); \ \ ____##func (regs); \ \ - interrupt_exit_prepare(regs, &state); \ + interrupt_exit_prepare(regs); \ } \ NOKPROBE_SYMBOL(func); \ \ @@ -482,14 +477,13 @@ static __always_inline long ____##func(struct pt_regs *regs); \ \ interrupt_handler long func(struct pt_regs *regs) \ { \ - struct interrupt_state state; \ long ret; \ \ - interrupt_enter_prepare(regs, &state); \ + interrupt_enter_prepare(regs); \ \ ret = ____##func (regs); \ \ - interrupt_exit_prepare(regs, &state); \ + interrupt_exit_prepare(regs); \ \ return ret; \ } \ @@ -518,13 +512,11 @@ static __always_inline void ____##func(struct pt_regs *regs); \ \ interrupt_handler void func(struct pt_regs *regs) \ { \ - struct interrupt_state state; \ - \ - interrupt_async_enter_prepare(regs, &state); \ + interrupt_async_enter_prepare(regs); \ \ ____##func (regs); \ \ - interrupt_async_exit_prepare(regs, &state); \ + interrupt_async_exit_prepare(regs); \ } \ NOKPROBE_SYMBOL(func); \ \ -- cgit v1.2.3 From 8667d0d64dd1f84fd41b5897fd87fa9113ae05e3 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 24 Feb 2022 17:22:14 +0100 Subject: powerpc: Fix build errors with newer binutils Building tinyconfig with gcc (Debian 11.2.0-16) and assembler (Debian 2.37.90.20220207) the following build error shows up: {standard input}: Assembler messages: {standard input}:1190: Error: unrecognized opcode: `stbcix' {standard input}:1433: Error: unrecognized opcode: `lwzcix' {standard input}:1453: Error: unrecognized opcode: `stbcix' {standard input}:1460: Error: unrecognized opcode: `stwcix' {standard input}:1596: Error: unrecognized opcode: `stbcix' ... Rework to add assembler directives [1] around the instruction. Going through them one by one shows that the changes should be safe. Like __get_user_atomic_128_aligned() is only called in p9_hmi_special_emu(), which according to the name is specific to power9. And __raw_rm_read*() are only called in things that are powernv or book3s_hv specific. [1] https://sourceware.org/binutils/docs/as/PowerPC_002dPseudo.html#PowerPC_002dPseudo Cc: stable@vger.kernel.org Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Anders Roxell Reviewed-by: Segher Boessenkool [mpe: Make commit subject more descriptive] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220224162215.3406642-2-anders.roxell@linaro.org --- arch/powerpc/include/asm/io.h | 40 ++++++++++++++++++++++++++++-------- arch/powerpc/include/asm/uaccess.h | 3 +++ arch/powerpc/platforms/powernv/rng.c | 6 +++++- 3 files changed, 40 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index beba4979bff9..fee979d3a1aa 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -359,25 +359,37 @@ static inline void __raw_writeq_be(unsigned long v, volatile void __iomem *addr) */ static inline void __raw_rm_writeb(u8 val, volatile void __iomem *paddr) { - __asm__ __volatile__("stbcix %0,0,%1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + stbcix %0,0,%1; \ + .machine pop;" : : "r" (val), "r" (paddr) : "memory"); } static inline void __raw_rm_writew(u16 val, volatile void __iomem *paddr) { - __asm__ __volatile__("sthcix %0,0,%1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + sthcix %0,0,%1; \ + .machine pop;" : : "r" (val), "r" (paddr) : "memory"); } static inline void __raw_rm_writel(u32 val, volatile void __iomem *paddr) { - __asm__ __volatile__("stwcix %0,0,%1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + stwcix %0,0,%1; \ + .machine pop;" : : "r" (val), "r" (paddr) : "memory"); } static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) { - __asm__ __volatile__("stdcix %0,0,%1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + stdcix %0,0,%1; \ + .machine pop;" : : "r" (val), "r" (paddr) : "memory"); } @@ -389,7 +401,10 @@ static inline void __raw_rm_writeq_be(u64 val, volatile void __iomem *paddr) static inline u8 __raw_rm_readb(volatile void __iomem *paddr) { u8 ret; - __asm__ __volatile__("lbzcix %0,0, %1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + lbzcix %0,0, %1; \ + .machine pop;" : "=r" (ret) : "r" (paddr) : "memory"); return ret; } @@ -397,7 +412,10 @@ static inline u8 __raw_rm_readb(volatile void __iomem *paddr) static inline u16 __raw_rm_readw(volatile void __iomem *paddr) { u16 ret; - __asm__ __volatile__("lhzcix %0,0, %1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + lhzcix %0,0, %1; \ + .machine pop;" : "=r" (ret) : "r" (paddr) : "memory"); return ret; } @@ -405,7 +423,10 @@ static inline u16 __raw_rm_readw(volatile void __iomem *paddr) static inline u32 __raw_rm_readl(volatile void __iomem *paddr) { u32 ret; - __asm__ __volatile__("lwzcix %0,0, %1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + lwzcix %0,0, %1; \ + .machine pop;" : "=r" (ret) : "r" (paddr) : "memory"); return ret; } @@ -413,7 +434,10 @@ static inline u32 __raw_rm_readl(volatile void __iomem *paddr) static inline u64 __raw_rm_readq(volatile void __iomem *paddr) { u64 ret; - __asm__ __volatile__("ldcix %0,0, %1" + __asm__ __volatile__(".machine push; \ + .machine power6; \ + ldcix %0,0, %1; \ + .machine pop;" : "=r" (ret) : "r" (paddr) : "memory"); return ret; } diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 63316100080c..4a35423f766d 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -125,8 +125,11 @@ do { \ */ #define __get_user_atomic_128_aligned(kaddr, uaddr, err) \ __asm__ __volatile__( \ + ".machine push\n" \ + ".machine altivec\n" \ "1: lvx 0,0,%1 # get user\n" \ " stvx 0,0,%2 # put kernel\n" \ + ".machine pop\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: li %0,%3\n" \ diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index b4386714494a..e3d44b36ae98 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -43,7 +43,11 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) unsigned long parity; /* Calculate the parity of the value */ - asm ("popcntd %0,%1" : "=r" (parity) : "r" (val)); + asm (".machine push; \ + .machine power7; \ + popcntd %0,%1; \ + .machine pop;" + : "=r" (parity) : "r" (val)); /* xor our value with the previous mask */ val ^= rng->mask; -- cgit v1.2.3 From 58dbe9b373df2828d873b1c0e5afc77485b2f376 Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Tue, 1 Mar 2022 17:47:43 -0300 Subject: powerpc/64s: Fix build failure when CONFIG_PPC_64S_HASH_MMU is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following build failure occurs when CONFIG_PPC_64S_HASH_MMU is not set: arch/powerpc/kernel/setup_64.c: In function ‘setup_per_cpu_areas’: arch/powerpc/kernel/setup_64.c:811:21: error: ‘mmu_linear_psize’ undeclared (first use in this function); did you mean ‘mmu_virtual_psize’? 811 | if (mmu_linear_psize == MMU_PAGE_4K) | ^~~~~~~~~~~~~~~~ | mmu_virtual_psize arch/powerpc/kernel/setup_64.c:811:21: note: each undeclared identifier is reported only once for each function it appears in Move the declaration of mmu_linear_psize outside of CONFIG_PPC_64S_HASH_MMU ifdef. After the above is fixed, it fails later with the following error: ld: arch/powerpc/kexec/file_load_64.o: in function `.arch_kexec_kernel_image_probe': file_load_64.c:(.text+0x1c1c): undefined reference to `.add_htab_mem_range' Fix that, too, by conditioning add_htab_mem_range() symbol to CONFIG_PPC_64S_HASH_MMU. Fixes: 387e220a2e5e ("powerpc/64s: Move hash MMU support code under CONFIG_PPC_64S_HASH_MMU") Reported-by: Erhard F. Signed-off-by: Murilo Opsfelder Araujo Signed-off-by: Michael Ellerman BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215567 Link: https://lore.kernel.org/r/20220301204743.45133-1-muriloo@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/include/asm/kexec_ranges.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index ba5b1becf518..006cbec70ffe 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -202,7 +202,6 @@ static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) /* * The current system page and segment sizes */ -extern int mmu_linear_psize; extern int mmu_virtual_psize; extern int mmu_vmalloc_psize; extern int mmu_io_psize; @@ -213,6 +212,7 @@ extern int mmu_io_psize; #define mmu_virtual_psize MMU_PAGE_4K #endif #endif +extern int mmu_linear_psize; extern int mmu_vmemmap_psize; /* MMU initialization */ diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index 7a90000f8d15..f83866a19e87 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -9,7 +9,7 @@ struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges); int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); int add_tce_mem_ranges(struct crash_mem **mem_ranges); int add_initrd_mem_range(struct crash_mem **mem_ranges); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU int add_htab_mem_range(struct crash_mem **mem_ranges); #else static inline int add_htab_mem_range(struct crash_mem **mem_ranges) -- cgit v1.2.3 From 48015b632f770c401f3816f144499a39f2884677 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 11 Feb 2022 17:32:37 +1100 Subject: powerpc: Fix STACKTRACE=n build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our skiroot_defconfig doesn't enable FTRACE, and so doesn't get STACKTRACE enabled either. That leads to a build failure since commit 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") made stacktrace.c build even when STACKTRACE=n. arch/powerpc/kernel/stacktrace.c: In function ‘handle_backtrace_ipi’: arch/powerpc/kernel/stacktrace.c:171:2: error: implicit declaration of function ‘nmi_cpu_backtrace’ 171 | nmi_cpu_backtrace(regs); | ^~~~~~~~~~~~~~~~~ arch/powerpc/kernel/stacktrace.c: In function ‘arch_trigger_cpumask_backtrace’: arch/powerpc/kernel/stacktrace.c:226:2: error: implicit declaration of function ‘nmi_trigger_cpumask_backtrace’ 226 | nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This happens because our headers haven't defined arch_trigger_cpumask_backtrace, which causes lib/nmi_backtrace.c not to build nmi_cpu_backtrace(). The code in question doesn't actually depend on STACKTRACE=y, that was just added because arch_trigger_cpumask_backtrace() lived in stacktrace.c for convenience. So drop the dependency on CONFIG_STACKTRACE, that causes lib/nmi_backtrace.c to build nmi_cpu_backtrace() etc. and fixes the build. Fixes: 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") [mpe: Cherry pick of 5a72345e6a78 from next into fixes] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220212111349.2806972-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index 160abcb8e9fa..ea0e487f87b1 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -9,7 +9,7 @@ long soft_nmi_interrupt(struct pt_regs *regs); static inline void arch_touch_nmi_watchdog(void) {} #endif -#if defined(CONFIG_NMI_IPI) && defined(CONFIG_STACKTRACE) +#ifdef CONFIG_NMI_IPI extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace -- cgit v1.2.3 From 1fe3a33ba0a37e7aa0df0acbe31d5dda7610c16e Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Mon, 28 Feb 2022 17:12:41 -0800 Subject: powerpc/vas: Add paste address mmap fault handler The user space opens VAS windows and issues NX requests by pasting CRB on the corresponding paste address mmap. When the system lost credits due to core removal, the kernel has to close the window in the hypervisor and make the window inactive by unmapping this paste address. Also the OS has to handle NX request page faults if the user space issue NX requests. This handler maps the new paste address with the same VMA when the window is active again (due to core add with DLPAR). Otherwise returns paste failure. Signed-off-by: Haren Myneni Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3956e1c1fdfde69127055ff1c0256c7d71104030.camel@linux.ibm.com --- arch/powerpc/include/asm/vas.h | 10 +++++ arch/powerpc/platforms/book3s/vas-api.c | 68 +++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index 57573d9c1e09..27251af18c65 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -29,6 +29,12 @@ #define VAS_THRESH_FIFO_GT_QTR_FULL 2 #define VAS_THRESH_FIFO_GT_EIGHTH_FULL 3 +/* + * VAS window Linux status bits + */ +#define VAS_WIN_ACTIVE 0x0 /* Used in platform independent */ + /* vas mmap() */ + /* * Get/Set bit fields */ @@ -59,6 +65,9 @@ struct vas_user_win_ref { struct pid *pid; /* PID of owner */ struct pid *tgid; /* Thread group ID of owner */ struct mm_struct *mm; /* Linux process mm_struct */ + struct mutex mmap_mutex; /* protects paste address mmap() */ + /* with DLPAR close/open windows */ + struct vm_area_struct *vma; /* Save VMA and used in DLPAR ops */ }; /* @@ -67,6 +76,7 @@ struct vas_user_win_ref { struct vas_window { u32 winid; u32 wcreds_max; /* Window credits */ + u32 status; /* Window status used in OS */ enum vas_cop_type cop; struct vas_user_win_ref task_ref; char *dbgname; diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 4d82c92ddd52..217b4a624d09 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -316,6 +316,7 @@ static int coproc_ioc_tx_win_open(struct file *fp, unsigned long arg) return PTR_ERR(txwin); } + mutex_init(&txwin->task_ref.mmap_mutex); cp_inst->txwin = txwin; return 0; @@ -350,6 +351,70 @@ static int coproc_release(struct inode *inode, struct file *fp) return 0; } +/* + * This fault handler is invoked when the core generates page fault on + * the paste address. Happens if the kernel closes window in hypervisor + * (on pseries) due to lost credit or the paste address is not mapped. + */ +static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct file *fp = vma->vm_file; + struct coproc_instance *cp_inst = fp->private_data; + struct vas_window *txwin; + vm_fault_t fault; + u64 paste_addr; + + /* + * window is not opened. Shouldn't expect this error. + */ + if (!cp_inst || !cp_inst->txwin) { + pr_err("%s(): Unexpected fault on paste address with TX window closed\n", + __func__); + return VM_FAULT_SIGBUS; + } + + txwin = cp_inst->txwin; + /* + * When the LPAR lost credits due to core removal or during + * migration, invalidate the existing mapping for the current + * paste addresses and set windows in-active (zap_page_range in + * reconfig_close_windows()). + * New mapping will be done later after migration or new credits + * available. So continue to receive faults if the user space + * issue NX request. + */ + if (txwin->task_ref.vma != vmf->vma) { + pr_err("%s(): No previous mapping with paste address\n", + __func__); + return VM_FAULT_SIGBUS; + } + + mutex_lock(&txwin->task_ref.mmap_mutex); + /* + * The window may be inactive due to lost credit (Ex: core + * removal with DLPAR). If the window is active again when + * the credit is available, map the new paste address at the + * the window virtual address. + */ + if (txwin->status == VAS_WIN_ACTIVE) { + paste_addr = cp_inst->coproc->vops->paste_addr(txwin); + if (paste_addr) { + fault = vmf_insert_pfn(vma, vma->vm_start, + (paste_addr >> PAGE_SHIFT)); + mutex_unlock(&txwin->task_ref.mmap_mutex); + return fault; + } + } + mutex_unlock(&txwin->task_ref.mmap_mutex); + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct vas_vm_ops = { + .fault = vas_mmap_fault, +}; + static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) { struct coproc_instance *cp_inst = fp->private_data; @@ -398,6 +463,9 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) pr_devel("%s(): paste addr %llx at %lx, rc %d\n", __func__, paste_addr, vma->vm_start, rc); + txwin->task_ref.vma = vma; + vma->vm_ops = &vas_vm_ops; + return rc; } -- cgit v1.2.3 From b5c63d90cc2de8ac6724fec84d1d72cfebcae41d Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Mon, 28 Feb 2022 17:13:15 -0800 Subject: powerpc/vas: Return paste instruction failure if no active window The VAS window may not be active if the system looses credits and the NX generates page fault when it receives request on unmap paste address. The kernel handles the fault by remap new paste address if the window is active again, Otherwise return the paste instruction failure if the executed instruction that caused the fault was a paste. Signed-off-by: Nicholas Piggin Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/492b9aefd593061d51dda67ee4d2fc449c000dce.camel@linux.ibm.com --- arch/powerpc/include/asm/ppc-opcode.h | 2 ++ arch/powerpc/platforms/book3s/vas-api.c | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 9675303b724e..82f1f0041c6f 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -262,6 +262,8 @@ #define PPC_INST_MFSPR_PVR 0x7c1f42a6 #define PPC_INST_MFSPR_PVR_MASK 0xfc1ffffe #define PPC_INST_MTMSRD 0x7c000164 +#define PPC_INST_PASTE 0x7c20070d +#define PPC_INST_PASTE_MASK 0xfc2007ff #define PPC_INST_POPCNTB 0x7c0000f4 #define PPC_INST_POPCNTB_MASK 0xfc0007fe #define PPC_INST_RFEBB 0x4c000124 diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 217b4a624d09..82f32781c5d2 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -351,6 +351,41 @@ static int coproc_release(struct inode *inode, struct file *fp) return 0; } +/* + * If the executed instruction that caused the fault was a paste, then + * clear regs CR0[EQ], advance NIP, and return 0. Else return error code. + */ +static int do_fail_paste(void) +{ + struct pt_regs *regs = current->thread.regs; + u32 instword; + + if (WARN_ON_ONCE(!regs)) + return -EINVAL; + + if (WARN_ON_ONCE(!user_mode(regs))) + return -EINVAL; + + /* + * If we couldn't translate the instruction, the driver should + * return success without handling the fault, it will be retried + * or the instruction fetch will fault. + */ + if (get_user(instword, (u32 __user *)(regs->nip))) + return -EAGAIN; + + /* + * Not a paste instruction, driver may fail the fault. + */ + if ((instword & PPC_INST_PASTE_MASK) != PPC_INST_PASTE) + return -ENOENT; + + regs->ccr &= ~0xe0000000; /* Clear CR0[0-2] to fail paste */ + regs_add_return_ip(regs, 4); /* Emulate the paste */ + + return 0; +} + /* * This fault handler is invoked when the core generates page fault on * the paste address. Happens if the kernel closes window in hypervisor @@ -364,6 +399,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) struct vas_window *txwin; vm_fault_t fault; u64 paste_addr; + int ret; /* * window is not opened. Shouldn't expect this error. @@ -408,6 +444,24 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) } mutex_unlock(&txwin->task_ref.mmap_mutex); + /* + * Received this fault due to closing the actual window. + * It can happen during migration or lost credits. + * Since no mapping, return the paste instruction failure + * to the user space. + */ + ret = do_fail_paste(); + /* + * The user space can retry several times until success (needed + * for migration) or should fallback to SW compression or + * manage with the existing open windows if available. + * Looking at sysfs interface, it can determine whether these + * failures are coming during migration or core removal: + * nr_used_credits > nr_total_credits when lost credits + */ + if (!ret || (ret == -EAGAIN)) + return VM_FAULT_NOPAGE; + return VM_FAULT_SIGBUS; } -- cgit v1.2.3 From 8ef7b9e1765a52c8023d9133a2438ac9f6da486a Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Mon, 28 Feb 2022 17:14:28 -0800 Subject: powerpc/pseries/vas: Close windows with DLPAR core removal The hypervisor assigns vas credits (windows) for each LPAR based on the number of cores configured in that system. The OS is expected to release credits when cores are removed, and may allocate more when cores are added. So there is a possibility of using excessive credits (windows) in the LPAR and the hypervisor expects the system to close the excessive windows so that NX load can be equally distributed across all LPARs in the system. When the OS closes the excessive windows in the hypervisor, it sets the window status inactive and invalidates window virtual address mapping. The user space receives paste instruction failure if any NX requests are issued on the inactive window. Then the user space can use with the available open windows or retry NX requests until this window active again. This patch also adds the notifier for core removal/add to close windows in the hypervisor if the system lost credits (core removal) and reopen windows in the hypervisor when the previously lost credits are available. Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/108928f9c00a48cc6a722315d482d07cf66acf5a.camel@linux.ibm.com --- arch/powerpc/include/asm/vas.h | 2 + arch/powerpc/platforms/pseries/vas.c | 207 +++++++++++++++++++++++++++++++++-- arch/powerpc/platforms/pseries/vas.h | 3 + 3 files changed, 204 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index 27251af18c65..6baf7b9ffed4 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -34,6 +34,8 @@ */ #define VAS_WIN_ACTIVE 0x0 /* Used in platform independent */ /* vas mmap() */ +/* Window is closed in the hypervisor due to lost credit */ +#define VAS_WIN_NO_CRED_CLOSE 0x00000001 /* * Get/Set bit fields diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 1035446f985b..a297720bcdae 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -370,13 +370,28 @@ static struct vas_window *vas_allocate_window(int vas_id, u64 flags, if (rc) goto out_free; - vas_user_win_add_mm_context(&txwin->vas_win.task_ref); txwin->win_type = cop_feat_caps->win_type; mutex_lock(&vas_pseries_mutex); - list_add(&txwin->win_list, &caps->list); + /* + * Possible to lose the acquired credit with DLPAR core + * removal after the window is opened. So if there are any + * closed windows (means with lost credits), do not give new + * window to user space. New windows will be opened only + * after the existing windows are reopened when credits are + * available. + */ + if (!caps->nr_close_wins) { + list_add(&txwin->win_list, &caps->list); + caps->nr_open_windows++; + mutex_unlock(&vas_pseries_mutex); + vas_user_win_add_mm_context(&txwin->vas_win.task_ref); + return &txwin->vas_win; + } mutex_unlock(&vas_pseries_mutex); - return &txwin->vas_win; + put_vas_user_win_ref(&txwin->vas_win.task_ref); + rc = -EBUSY; + pr_err("No credit is available to allocate window\n"); out_free: /* @@ -439,14 +454,24 @@ static int vas_deallocate_window(struct vas_window *vwin) caps = &vascaps[win->win_type].caps; mutex_lock(&vas_pseries_mutex); - rc = deallocate_free_window(win); - if (rc) { - mutex_unlock(&vas_pseries_mutex); - return rc; - } + /* + * VAS window is already closed in the hypervisor when + * lost the credit. So just remove the entry from + * the list, remove task references and free vas_window + * struct. + */ + if (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) { + rc = deallocate_free_window(win); + if (rc) { + mutex_unlock(&vas_pseries_mutex); + return rc; + } + } else + vascaps[win->win_type].nr_close_wins--; list_del(&win->win_list); atomic_dec(&caps->nr_used_credits); + vascaps[win->win_type].nr_open_windows--; mutex_unlock(&vas_pseries_mutex); put_vas_user_win_ref(&vwin->task_ref); @@ -501,6 +526,7 @@ static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type, memset(vcaps, 0, sizeof(*vcaps)); INIT_LIST_HEAD(&vcaps->list); + vcaps->feat = feat; caps = &vcaps->caps; rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, feat, @@ -539,6 +565,168 @@ static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type, return 0; } +/* + * The hypervisor reduces the available credits if the LPAR lost core. It + * means the excessive windows should not be active and the user space + * should not be using these windows to send compression requests to NX. + * So the kernel closes the excessive windows and unmap the paste address + * such that the user space receives paste instruction failure. Then up to + * the user space to fall back to SW compression and manage with the + * existing windows. + */ +static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds) +{ + struct pseries_vas_window *win, *tmp; + struct vas_user_win_ref *task_ref; + struct vm_area_struct *vma; + int rc = 0; + + list_for_each_entry_safe(win, tmp, &vcap->list, win_list) { + /* + * This window is already closed due to lost credit + * before. Go for next window. + */ + if (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) + continue; + + task_ref = &win->vas_win.task_ref; + mutex_lock(&task_ref->mmap_mutex); + vma = task_ref->vma; + /* + * Number of available credits are reduced, So select + * and close windows. + */ + win->vas_win.status |= VAS_WIN_NO_CRED_CLOSE; + + mmap_write_lock(task_ref->mm); + /* + * vma is set in the original mapping. But this mapping + * is done with mmap() after the window is opened with ioctl. + * so we may not see the original mapping if the core remove + * is done before the original mmap() and after the ioctl. + */ + if (vma) + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + + mmap_write_unlock(task_ref->mm); + mutex_unlock(&task_ref->mmap_mutex); + /* + * Close VAS window in the hypervisor, but do not + * free vas_window struct since it may be reused + * when the credit is available later (DLPAR with + * adding cores). This struct will be used + * later when the process issued with close(FD). + */ + rc = deallocate_free_window(win); + if (rc) + return rc; + + vcap->nr_close_wins++; + + if (!--excess_creds) + break; + } + + return 0; +} + +/* + * Get new VAS capabilities when the core add/removal configuration + * changes. Reconfig window configurations based on the credits + * availability from this new capabilities. + */ +static int vas_reconfig_capabilties(u8 type) +{ + struct hv_vas_cop_feat_caps *hv_caps; + struct vas_cop_feat_caps *caps; + int old_nr_creds, new_nr_creds; + struct vas_caps *vcaps; + int rc = 0, nr_active_wins; + + if (type >= VAS_MAX_FEAT_TYPE) { + pr_err("Invalid credit type %d\n", type); + return -EINVAL; + } + + vcaps = &vascaps[type]; + caps = &vcaps->caps; + + hv_caps = kmalloc(sizeof(*hv_caps), GFP_KERNEL); + if (!hv_caps) + return -ENOMEM; + + mutex_lock(&vas_pseries_mutex); + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vcaps->feat, + (u64)virt_to_phys(hv_caps)); + if (rc) + goto out; + + new_nr_creds = be16_to_cpu(hv_caps->target_lpar_creds); + + old_nr_creds = atomic_read(&caps->nr_total_credits); + + atomic_set(&caps->nr_total_credits, new_nr_creds); + /* + * The total number of available credits may be decreased or + * inceased with DLPAR operation. Means some windows have to be + * closed / reopened. Hold the vas_pseries_mutex so that the + * the user space can not open new windows. + */ + if (old_nr_creds > new_nr_creds) { + /* + * # active windows is more than new LPAR available + * credits. So close the excessive windows. + * On pseries, each window will have 1 credit. + */ + nr_active_wins = vcaps->nr_open_windows - vcaps->nr_close_wins; + if (nr_active_wins > new_nr_creds) + rc = reconfig_close_windows(vcaps, + nr_active_wins - new_nr_creds); + } + +out: + mutex_unlock(&vas_pseries_mutex); + kfree(hv_caps); + return rc; +} +/* + * Total number of default credits available (target_credits) + * in LPAR depends on number of cores configured. It varies based on + * whether processors are in shared mode or dedicated mode. + * Get the notifier when CPU configuration is changed with DLPAR + * operation so that get the new target_credits (vas default capabilities) + * and then update the existing windows usage if needed. + */ +static int pseries_vas_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct of_reconfig_data *rd = data; + struct device_node *dn = rd->dn; + const __be32 *intserv = NULL; + int len, rc = 0; + + if ((action == OF_RECONFIG_ATTACH_NODE) || + (action == OF_RECONFIG_DETACH_NODE)) + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", + &len); + /* + * Processor config is not changed + */ + if (!intserv) + return NOTIFY_OK; + + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE); + if (rc) + pr_err("Failed reconfig VAS capabilities with DLPAR\n"); + + return rc; +} + +static struct notifier_block pseries_vas_nb = { + .notifier_call = pseries_vas_notifier, +}; + static int __init pseries_vas_init(void) { struct hv_vas_cop_feat_caps *hv_cop_caps; @@ -592,6 +780,9 @@ static int __init pseries_vas_init(void) goto out_cop; } + if (copypaste_feat && firmware_has_feature(FW_FEATURE_LPAR)) + of_reconfig_notifier_register(&pseries_vas_nb); + pr_info("GZIP feature is available\n"); out_cop: diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 2872532ed72a..701363cfd7c1 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -83,6 +83,9 @@ struct vas_cop_feat_caps { struct vas_caps { struct vas_cop_feat_caps caps; struct list_head list; /* List of open windows */ + int nr_close_wins; /* closed windows in the hypervisor for DLPAR */ + int nr_open_windows; /* Number of successful open windows */ + u8 feat; /* Feature type */ }; /* -- cgit v1.2.3 From 716d7a2e3764cb79061371767bff1a691adb4e7f Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Sun, 27 Feb 2022 23:52:08 -0800 Subject: powerpc/pseries/vas: Modify reconfig open/close functions for migration VAS is a hardware engine stays on the chip. So when the partition migrates, all VAS windows on the source system have to be closed and reopen them on the destination after migration. The kernel has to consider both DLPAR CPU and migration events to take action on VAS windows. So using VAS_WIN_NO_CRED_CLOSE and VAS_WIN_MIGRATE_CLOSE status bits and windows will be reopened after migration only after both status bits are cleared. This patch make changes to the current reconfig_open/close_windows functions to support migration: - Set VAS_WIN_MIGRATE_CLOSE to the window status when closes and reopen windows with the same status during resume. - Continue to close all windows even if deallocate HCALL failed (should not happen) since no way to stop migration with the current LPM implementation. - If the DLPAR CPU event happens while migration is in progress, set VAS_WIN_NO_CRED_CLOSE to the window status. Close window happens with the first event (migration or DLPAR) and Reopen window happens only with the last event (migration or DLPAR). Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0aad580387cb58379496b4cbbd7c5596e9ea70be.camel@linux.ibm.com --- arch/powerpc/include/asm/vas.h | 2 + arch/powerpc/platforms/pseries/vas.c | 88 +++++++++++++++++++++++++++++------- 2 files changed, 73 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index 6baf7b9ffed4..83afcb6c194b 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -36,6 +36,8 @@ /* vas mmap() */ /* Window is closed in the hypervisor due to lost credit */ #define VAS_WIN_NO_CRED_CLOSE 0x00000001 +/* Window is closed due to migration */ +#define VAS_WIN_MIGRATE_CLOSE 0x00000002 /* * Get/Set bit fields diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 3bb219f54806..fbcf311da0ec 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -457,11 +457,12 @@ static int vas_deallocate_window(struct vas_window *vwin) mutex_lock(&vas_pseries_mutex); /* * VAS window is already closed in the hypervisor when - * lost the credit. So just remove the entry from - * the list, remove task references and free vas_window + * lost the credit or with migration. So just remove the entry + * from the list, remove task references and free vas_window * struct. */ - if (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) { + if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) && + !(win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) { rc = deallocate_free_window(win); if (rc) { mutex_unlock(&vas_pseries_mutex); @@ -578,12 +579,14 @@ static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type, * by setting the remapping to new paste address if the window is * active. */ -static int reconfig_open_windows(struct vas_caps *vcaps, int creds) +static int reconfig_open_windows(struct vas_caps *vcaps, int creds, + bool migrate) { long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID}; struct vas_cop_feat_caps *caps = &vcaps->caps; struct pseries_vas_window *win = NULL, *tmp; int rc, mv_ents = 0; + int flag; /* * Nothing to do if there are no closed windows. @@ -602,8 +605,10 @@ static int reconfig_open_windows(struct vas_caps *vcaps, int creds) * (dedicated). If 1 core is added, this LPAR can have 20 more * credits. It means the kernel can reopen 20 windows. So move * 20 entries in the VAS windows lost and reopen next 20 windows. + * For partition migration, reopen all windows that are closed + * during resume. */ - if (vcaps->nr_close_wins > creds) + if ((vcaps->nr_close_wins > creds) && !migrate) mv_ents = vcaps->nr_close_wins - creds; list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) { @@ -613,12 +618,35 @@ static int reconfig_open_windows(struct vas_caps *vcaps, int creds) mv_ents--; } + /* + * Open windows if they are closed only with migration or + * DLPAR (lost credit) before. + */ + if (migrate) + flag = VAS_WIN_MIGRATE_CLOSE; + else + flag = VAS_WIN_NO_CRED_CLOSE; + list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) { + /* + * This window is closed with DLPAR and migration events. + * So reopen the window with the last event. + * The user space is not suspended with the current + * migration notifier. So the user space can issue DLPAR + * CPU hotplug while migration in progress. In this case + * this window will be opened with the last event. + */ + if ((win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) && + (win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) { + win->vas_win.status &= ~flag; + continue; + } + /* * Nothing to do on this window if it is not closed - * with VAS_WIN_NO_CRED_CLOSE + * with this flag */ - if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE)) + if (!(win->vas_win.status & flag)) continue; rc = allocate_setup_window(win, (u64 *)&domain[0], @@ -634,7 +662,7 @@ static int reconfig_open_windows(struct vas_caps *vcaps, int creds) /* * Set window status to active */ - win->vas_win.status &= ~VAS_WIN_NO_CRED_CLOSE; + win->vas_win.status &= ~flag; mutex_unlock(&win->vas_win.task_ref.mmap_mutex); win->win_type = caps->win_type; if (!--vcaps->nr_close_wins) @@ -661,20 +689,32 @@ out: * the user space to fall back to SW compression and manage with the * existing windows. */ -static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds) +static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, + bool migrate) { struct pseries_vas_window *win, *tmp; struct vas_user_win_ref *task_ref; struct vm_area_struct *vma; - int rc = 0; + int rc = 0, flag; + + if (migrate) + flag = VAS_WIN_MIGRATE_CLOSE; + else + flag = VAS_WIN_NO_CRED_CLOSE; list_for_each_entry_safe(win, tmp, &vcap->list, win_list) { /* * This window is already closed due to lost credit - * before. Go for next window. + * or for migration before. Go for next window. + * For migration, nothing to do since this window + * closed for DLPAR and will be reopened even on + * the destination system with other DLPAR operation. */ - if (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) + if ((win->vas_win.status & VAS_WIN_MIGRATE_CLOSE) || + (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE)) { + win->vas_win.status |= flag; continue; + } task_ref = &win->vas_win.task_ref; mutex_lock(&task_ref->mmap_mutex); @@ -683,7 +723,7 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds) * Number of available credits are reduced, So select * and close windows. */ - win->vas_win.status |= VAS_WIN_NO_CRED_CLOSE; + win->vas_win.status |= flag; mmap_write_lock(task_ref->mm); /* @@ -706,12 +746,24 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds) * later when the process issued with close(FD). */ rc = deallocate_free_window(win); - if (rc) + /* + * This failure is from the hypervisor. + * No way to stop migration for these failures. + * So ignore error and continue closing other windows. + */ + if (rc && !migrate) return rc; vcap->nr_close_wins++; - if (!--excess_creds) + /* + * For migration, do not depend on lpar_creds in case if + * mismatch with the hypervisor value (should not happen). + * So close all active windows in the list and will be + * reopened windows based on the new lpar_creds on the + * destination system during resume. + */ + if (!migrate && !--excess_creds) break; } @@ -761,7 +813,8 @@ int vas_reconfig_capabilties(u8 type) * target, reopen windows if they are closed due to * the previous DLPAR (core removal). */ - rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds); + rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds, + false); } else { /* * # active windows is more than new LPAR available @@ -771,7 +824,8 @@ int vas_reconfig_capabilties(u8 type) nr_active_wins = vcaps->nr_open_windows - vcaps->nr_close_wins; if (nr_active_wins > new_nr_creds) rc = reconfig_close_windows(vcaps, - nr_active_wins - new_nr_creds); + nr_active_wins - new_nr_creds, + false); } out: -- cgit v1.2.3 From 4eeac2b0aaadc3d1943d348d8565f7cfb93272b9 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 14 Feb 2022 16:11:40 +0530 Subject: powerpc64: Set PPC64_ELF_ABI_v[1|2] macros to 1 Set macros to 1 so that they can be used with __is_defined(). Suggested-by: Christophe Leroy Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/abad4868416ddfd42893f99c0cad8e5faf998095.1644834730.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index 97da77bc48c9..84078c28c1a2 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -13,9 +13,9 @@ #ifdef __powerpc64__ #if defined(_CALL_ELF) && _CALL_ELF == 2 -#define PPC64_ELF_ABI_v2 +#define PPC64_ELF_ABI_v2 1 #else -#define PPC64_ELF_ABI_v1 +#define PPC64_ELF_ABI_v1 1 #endif #endif /* __powerpc64__ */ -- cgit v1.2.3 From cc15ff3275694fedc33cd3d53212a43eec7aa0bc Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Thu, 20 Jan 2022 17:49:31 +0530 Subject: powerpc/mce: Avoid using irq_work_queue() in realmode In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220120121931.517974-1-ganeshgr@linux.ibm.com --- arch/powerpc/include/asm/machdep.h | 2 ++ arch/powerpc/include/asm/mce.h | 13 +++++++ arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c | 60 +++++++++++++++++--------------- arch/powerpc/kernel/time.c | 2 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 32 +---------------- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 53 insertions(+), 59 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 06ac7ef07c85..358d171ae8e0 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -94,6 +94,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool (*mce_check_early_recovery)(struct pt_regs *regs); + void (*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..c9f0936bd3c9 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,21 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void mce_irq_work_queue(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_run_irq_context_handlers(void); +#else +static inline void mce_run_irq_context_handlers(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void set_mce_pending_irq_work(void); +void clear_mce_pending_irq_work(void); +#endif /* CONFIG_PPC_BOOK3S_64 */ + #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 295573a82c66..8330968ca346 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -288,6 +288,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u8 mce_pending_irq_work; #endif /* CONFIG_PPC_BOOK3S_64 */ } ____cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 811c867ad6c6..671d727d06f7 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { - .func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,13 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +void mce_irq_work_queue(void) +{ + /* Raise decrementer interrupt */ + arch_irq_work_raise(); + set_mce_pending_irq_work(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -217,7 +214,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -239,7 +236,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(&mce_ue_event_irq_work); + mce_irq_work_queue(); } /* @@ -249,7 +246,6 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; - unsigned long msr; if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return; @@ -263,20 +259,7 @@ void machine_check_queue_event(void) memcpy(&local_paca->mce_info->mce_event_queue[index], &evt, sizeof(evt)); - /* - * Queue irq work to process this event later. Before - * queuing the work enable translation for non radix LPAR, - * as irq_work_queue may try to access memory outside RMO - * region. - */ - if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { - msr = mfmsr(); - mtmsr(msr | MSR_IR | MSR_DR); - irq_work_queue(&mce_event_process_work); - mtmsr(msr); - } else { - irq_work_queue(&mce_event_process_work); - } + mce_irq_work_queue(); } void mce_common_process_ue(struct pt_regs *regs, @@ -338,7 +321,7 @@ static void machine_process_ue_event(struct work_struct *work) * process pending MCE event from the mce event queue. This function will be * called during syscall exit. */ -static void machine_check_process_queued_event(struct irq_work *work) +static void machine_check_process_queued_event(void) { int index; struct machine_check_event *evt; @@ -363,6 +346,27 @@ static void machine_check_process_queued_event(struct irq_work *work) } } +void set_mce_pending_irq_work(void) +{ + local_paca->mce_pending_irq_work = 1; +} + +void clear_mce_pending_irq_work(void) +{ + local_paca->mce_pending_irq_work = 0; +} + +void mce_run_irq_context_handlers(void) +{ + if (unlikely(local_paca->mce_pending_irq_work)) { + if (ppc_md.machine_check_log_err) + ppc_md.machine_check_log_err(); + machine_check_process_queued_event(); + machine_check_ue_work(); + clear_mce_pending_irq_work(); + } +} + void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index cd0b8b71ecdd..17598bba54eb 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -70,6 +70,7 @@ #include #include #include +#include /* powerpc clocksource/clockevent code */ @@ -638,6 +639,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) if (test_irq_work_pending()) { clear_irq_work_pending(); + mce_run_irq_context_handlers(); irq_work_run(); } diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 56c9ef9052e9..af162aeeae86 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -21,6 +21,7 @@ struct pt_regs; extern int pSeries_system_reset_exception(struct pt_regs *regs); extern int pSeries_machine_check_exception(struct pt_regs *regs); extern long pseries_machine_check_realmode(struct pt_regs *regs); +void pSeries_machine_check_log_err(void); #ifdef CONFIG_SMP extern void smp_init_pseries(void); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 2a158e828c99..f12516c3998c 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock); static int ras_check_exception_token; -static void mce_process_errlog_event(struct irq_work *work); -static struct irq_work mce_errlog_process_work = { - .func = mce_process_errlog_event, -}; - #define EPOW_SENSOR_TOKEN 9 #define EPOW_SENSOR_INDEX 0 @@ -745,7 +740,6 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) struct pseries_errorlog *pseries_log; struct pseries_mc_errorlog *mce_log = NULL; int disposition = rtas_error_disposition(errp); - unsigned long msr; u8 error_type; if (!rtas_error_extended(errp)) @@ -759,40 +753,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) error_type = mce_log->error_type; disposition = mce_handle_err_realmode(disposition, error_type); - - /* - * Enable translation as we will be accessing per-cpu variables - * in save_mce_event() which may fall outside RMO region, also - * leave it enabled because subsequently we will be queuing work - * to workqueues where again per-cpu variables accessed, besides - * fwnmi_release_errinfo() crashes when called in realmode on - * pseries. - * Note: All the realmode handling like flushing SLB entries for - * SLB multihit is done by now. - */ out: - msr = mfmsr(); - mtmsr(msr | MSR_IR | MSR_DR); - disposition = mce_handle_err_virtmode(regs, errp, mce_log, disposition); - - /* - * Queue irq work to log this rtas event later. - * irq_work_queue uses per-cpu variables, so do this in virt - * mode as well. - */ - irq_work_queue(&mce_errlog_process_work); - - mtmsr(msr); - return disposition; } /* * Process MCE rtas errlog event. */ -static void mce_process_errlog_event(struct irq_work *work) +void pSeries_machine_check_log_err(void) { struct rtas_error_log *err; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 83a04d967a59..069d7b3bb142 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -1086,6 +1086,7 @@ define_machine(pseries) { .system_reset_exception = pSeries_system_reset_exception, .machine_check_early = pseries_machine_check_realmode, .machine_check_exception = pSeries_machine_check_exception, + .machine_check_log_err = pSeries_machine_check_log_err, #ifdef CONFIG_KEXEC_CORE .machine_kexec = pSeries_machine_kexec, .kexec_cpu_down = pseries_kexec_cpu_down, -- cgit v1.2.3 From 3c14b73454cf9f6e2146443fdfbdfb912c0efed3 Mon Sep 17 00:00:00 2001 From: "Pratik R. Sampat" Date: Thu, 17 Feb 2022 16:23:20 +0530 Subject: powerpc/pseries: Interface to represent PAPR firmware attributes Adds a syscall interface to represent the energy and frequency related PAPR attributes on the system using the new H_CALL "H_GET_ENERGY_SCALE_INFO". H_GET_EM_PARMS H_CALL was previously responsible for exporting this information in the lparcfg, however the H_GET_EM_PARMS H_CALL will be deprecated P10 onwards. The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format: hcall( uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info uint64 flags, // Per the flag request uint64 firstAttributeId,// The attribute id uint64 bufferAddress, // Guest physical address of the output buffer uint64 bufferSize // The size in bytes of the output buffer ); As specified in PAPR+ v2.11, section 14.14.3. This H_CALL can query either all the attributes at once with firstAttributeId = 0, flags = 0 as well as query only one attribute at a time with firstAttributeId = id, flags = 1. The output buffer consists of the following 1. number of attributes - 8 bytes 2. array offset to the data location - 8 bytes 3. version info - 1 byte 4. A data array of size num attributes, which contains the following: a. attribute ID - 8 bytes b. attribute value in number - 8 bytes c. attribute name in string - 64 bytes d. attribute value in string - 64 bytes The new H_CALL exports information in direct string value format, hence a new interface has been introduced in /sys/firmware/papr/energy_scale_info to export this information to userspace so that the firmware can add new values without the need for the kernel to be changed. The H_CALL returns the name, numeric value and string value (if exists) The format of exposing the sysfs information is as follows: /sys/firmware/papr/energy_scale_info/ |-- / |-- desc |-- value |-- value_desc (if exists) |-- / |-- desc |-- value |-- value_desc (if exists) ... The energy information that is exported is useful for userspace tools such as powerpc-utils. Currently these tools infer the "power_mode_data" value in the lparcfg, which in turn is obtained from the to be deprecated H_GET_EM_PARMS H_CALL. On future platforms, such userspace utilities will have to look at the data returned from the new H_CALL being populated in this new sysfs interface and report this information directly without the need of interpretation. Signed-off-by: Pratik R. Sampat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220217105321.52941-2-psampat@linux.ibm.com --- .../testing/sysfs-firmware-papr-energy-scale-info | 29 ++ arch/powerpc/include/asm/firmware.h | 4 +- arch/powerpc/include/asm/hvcall.h | 3 +- arch/powerpc/kvm/trace_hv.h | 1 + arch/powerpc/platforms/pseries/Makefile | 3 +- arch/powerpc/platforms/pseries/firmware.c | 1 + .../platforms/pseries/papr_platform_attributes.c | 361 +++++++++++++++++++++ 7 files changed, 399 insertions(+), 3 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c (limited to 'arch/powerpc/include') diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info new file mode 100644 index 000000000000..141a6b371469 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info @@ -0,0 +1,29 @@ +What: /sys/firmware/papr/energy_scale_info +Date: February 2022 +Contact: Linux for PowerPC mailing list +Description: Directory hosting a set of platform attributes like + energy/frequency on Linux running as a PAPR guest. + + Each file in a directory contains a platform + attribute hierarchy pertaining to performance/ + energy-savings mode and processor frequency. + +What: /sys/firmware/papr/energy_scale_info/ +Date: February 2022 +Contact: Linux for PowerPC mailing list +Description: Energy, frequency attributes directory for POWERVM servers + +What: /sys/firmware/papr/energy_scale_info//desc +Date: February 2022 +Contact: Linux for PowerPC mailing list +Description: String description of the energy attribute of + +What: /sys/firmware/papr/energy_scale_info//value +Date: February 2022 +Contact: Linux for PowerPC mailing list +Description: Numeric value of the energy attribute of + +What: /sys/firmware/papr/energy_scale_info//value_desc +Date: February 2022 +Contact: Linux for PowerPC mailing list +Description: String value of the energy attribute of diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 9b702d2b80fb..8dddd34b8ecf 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -54,6 +54,7 @@ #define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000) #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000) #define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000) +#define FW_FEATURE_ENERGY_SCALE_INFO ASM_CONST(0x0000040000000000) #ifndef __ASSEMBLY__ @@ -74,7 +75,8 @@ enum { FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | - FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY, + FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY | + FW_FEATURE_ENERGY_SCALE_INFO, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 9bcf345cb208..48f510ba9f4a 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -323,7 +323,8 @@ #define H_SCM_PERFORMANCE_STATS 0x418 #define H_RPT_INVALIDATE 0x448 #define H_SCM_FLUSH 0x44C -#define MAX_HCALL_OPCODE H_SCM_FLUSH +#define H_GET_ENERGY_SCALE_INFO 0x450 +#define MAX_HCALL_OPCODE H_GET_ENERGY_SCALE_INFO /* Scope args for H_SCM_UNBIND_ALL */ #define H_UNBIND_SCOPE_ALL (0x1) diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index 830a126e095d..38cd0ed0a617 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -115,6 +115,7 @@ {H_VASI_STATE, "H_VASI_STATE"}, \ {H_ENABLE_CRQ, "H_ENABLE_CRQ"}, \ {H_GET_EM_PARMS, "H_GET_EM_PARMS"}, \ + {H_GET_ENERGY_SCALE_INFO, "H_GET_ENERGY_SCALE_INFO"}, \ {H_SET_MPP, "H_SET_MPP"}, \ {H_GET_MPP, "H_GET_MPP"}, \ {H_HOME_NODE_ASSOCIATIVITY, "H_HOME_NODE_ASSOCIATIVITY"}, \ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 29b522d2c755..9764e1a2ed5c 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -6,7 +6,8 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ of_helpers.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ - pci.o pci_dlpar.o eeh_pseries.o msi.o + pci.o pci_dlpar.o eeh_pseries.o msi.o \ + papr_platform_attributes.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_KEXEC_CORE) += kexec.o obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index f162156b7b68..09c119b2f623 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -66,6 +66,7 @@ hypertas_fw_features_table[] = { {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"}, {FW_FEATURE_PAPR_SCM, "hcall-scm"}, {FW_FEATURE_RPT_INVALIDATE, "hcall-rpt-invalidate"}, + {FW_FEATURE_ENERGY_SCALE_INFO, "hcall-energy-scale-info"}, }; /* Build up the firmware features bitmask using the contents of diff --git a/arch/powerpc/platforms/pseries/papr_platform_attributes.c b/arch/powerpc/platforms/pseries/papr_platform_attributes.c new file mode 100644 index 000000000000..515150417bb3 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr_platform_attributes.c @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Platform energy and frequency attributes driver + * + * This driver creates a sys file at /sys/firmware/papr/ which encapsulates a + * directory structure containing files in keyword - value pairs that specify + * energy and frequency configuration of the system. + * + * The format of exposing the sysfs information is as follows: + * /sys/firmware/papr/energy_scale_info/ + * |-- / + * |-- desc + * |-- value + * |-- value_desc (if exists) + * |-- / + * |-- desc + * |-- value + * |-- value_desc (if exists) + * + * Copyright 2022 IBM Corp. + */ + +#include +#include + +#include "pseries.h" + +/* + * Flag attributes to fetch either all or one attribute from the HCALL + * flag = BE(0) => fetch all attributes with firstAttributeId = 0 + * flag = BE(1) => fetch a single attribute with firstAttributeId = id + */ +#define ESI_FLAGS_ALL 0 +#define ESI_FLAGS_SINGLE (1ull << 63) + +#define KOBJ_MAX_ATTRS 3 + +#define ESI_HDR_SIZE sizeof(struct h_energy_scale_info_hdr) +#define ESI_ATTR_SIZE sizeof(struct energy_scale_attribute) +#define CURR_MAX_ESI_ATTRS 8 + +struct energy_scale_attribute { + __be64 id; + __be64 val; + u8 desc[64]; + u8 value_desc[64]; +} __packed; + +struct h_energy_scale_info_hdr { + __be64 num_attrs; + __be64 array_offset; + u8 data_header_version; +} __packed; + +struct papr_attr { + u64 id; + struct kobj_attribute kobj_attr; +}; + +struct papr_group { + struct attribute_group pg; + struct papr_attr pgattrs[KOBJ_MAX_ATTRS]; +}; + +static struct papr_group *papr_groups; +/* /sys/firmware/papr */ +static struct kobject *papr_kobj; +/* /sys/firmware/papr/energy_scale_info */ +static struct kobject *esi_kobj; + +/* + * Energy modes can change dynamically hence making a new hcall each time the + * information needs to be retrieved + */ +static int papr_get_attr(u64 id, struct energy_scale_attribute *esi) +{ + int esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * ESI_ATTR_SIZE); + int ret, max_esi_attrs = CURR_MAX_ESI_ATTRS; + struct energy_scale_attribute *curr_esi; + struct h_energy_scale_info_hdr *hdr; + char *buf; + + buf = kmalloc(esi_buf_size, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + +retry: + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_SINGLE, + id, virt_to_phys(buf), + esi_buf_size); + + /* + * If the hcall fails with not enough memory for either the + * header or data, attempt to allocate more + */ + if (ret == H_PARTIAL || ret == H_P4) { + char *temp_buf; + + max_esi_attrs += 4; + esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * max_esi_attrs); + + temp_buf = krealloc(buf, esi_buf_size, GFP_KERNEL); + if (temp_buf) + buf = temp_buf; + else + return -ENOMEM; + + goto retry; + } + + if (ret != H_SUCCESS) { + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO"); + ret = -EIO; + goto out_buf; + } + + hdr = (struct h_energy_scale_info_hdr *) buf; + curr_esi = (struct energy_scale_attribute *) + (buf + be64_to_cpu(hdr->array_offset)); + + if (esi_buf_size < + be64_to_cpu(hdr->array_offset) + (be64_to_cpu(hdr->num_attrs) + * sizeof(struct energy_scale_attribute))) { + ret = -EIO; + goto out_buf; + } + + *esi = *curr_esi; + +out_buf: + kfree(buf); + + return ret; +} + +/* + * Extract and export the description of the energy scale attributes + */ +static ssize_t desc_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, + char *buf) +{ + struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr, + kobj_attr); + struct energy_scale_attribute esi; + int ret; + + ret = papr_get_attr(pattr->id, &esi); + if (ret) + return ret; + + return sysfs_emit(buf, "%s\n", esi.desc); +} + +/* + * Extract and export the numeric value of the energy scale attributes + */ +static ssize_t val_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, + char *buf) +{ + struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr, + kobj_attr); + struct energy_scale_attribute esi; + int ret; + + ret = papr_get_attr(pattr->id, &esi); + if (ret) + return ret; + + return sysfs_emit(buf, "%llu\n", be64_to_cpu(esi.val)); +} + +/* + * Extract and export the value description in string format of the energy + * scale attributes + */ +static ssize_t val_desc_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, + char *buf) +{ + struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr, + kobj_attr); + struct energy_scale_attribute esi; + int ret; + + ret = papr_get_attr(pattr->id, &esi); + if (ret) + return ret; + + return sysfs_emit(buf, "%s\n", esi.value_desc); +} + +static struct papr_ops_info { + const char *attr_name; + ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *kobj_attr, + char *buf); +} ops_info[KOBJ_MAX_ATTRS] = { + { "desc", desc_show }, + { "value", val_show }, + { "value_desc", val_desc_show }, +}; + +static void add_attr(u64 id, int index, struct papr_attr *attr) +{ + attr->id = id; + sysfs_attr_init(&attr->kobj_attr.attr); + attr->kobj_attr.attr.name = ops_info[index].attr_name; + attr->kobj_attr.attr.mode = 0444; + attr->kobj_attr.show = ops_info[index].show; +} + +static int add_attr_group(u64 id, struct papr_group *pg, bool show_val_desc) +{ + int i; + + for (i = 0; i < KOBJ_MAX_ATTRS; i++) { + if (!strcmp(ops_info[i].attr_name, "value_desc") && + !show_val_desc) { + continue; + } + add_attr(id, i, &pg->pgattrs[i]); + pg->pg.attrs[i] = &pg->pgattrs[i].kobj_attr.attr; + } + + return sysfs_create_group(esi_kobj, &pg->pg); +} + + +static int __init papr_init(void) +{ + int esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * ESI_ATTR_SIZE); + int ret, idx, i, max_esi_attrs = CURR_MAX_ESI_ATTRS; + struct h_energy_scale_info_hdr *esi_hdr; + struct energy_scale_attribute *esi_attrs; + uint64_t num_attrs; + char *esi_buf; + + if (!firmware_has_feature(FW_FEATURE_LPAR) || + !firmware_has_feature(FW_FEATURE_ENERGY_SCALE_INFO)) { + return -ENXIO; + } + + esi_buf = kmalloc(esi_buf_size, GFP_KERNEL); + if (esi_buf == NULL) + return -ENOMEM; + /* + * hcall( + * uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info + * uint64 flags, // Per the flag request + * uint64 firstAttributeId, // The attribute id + * uint64 bufferAddress, // Guest physical address of the output buffer + * uint64 bufferSize); // The size in bytes of the output buffer + */ +retry: + + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_ALL, 0, + virt_to_phys(esi_buf), esi_buf_size); + + /* + * If the hcall fails with not enough memory for either the + * header or data, attempt to allocate more + */ + if (ret == H_PARTIAL || ret == H_P4) { + char *temp_esi_buf; + + max_esi_attrs += 4; + esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * max_esi_attrs); + + temp_esi_buf = krealloc(esi_buf, esi_buf_size, GFP_KERNEL); + if (temp_esi_buf) + esi_buf = temp_esi_buf; + else + return -ENOMEM; + + goto retry; + } + + if (ret != H_SUCCESS) { + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO, ret: %d\n", ret); + goto out_free_esi_buf; + } + + esi_hdr = (struct h_energy_scale_info_hdr *) esi_buf; + num_attrs = be64_to_cpu(esi_hdr->num_attrs); + esi_attrs = (struct energy_scale_attribute *) + (esi_buf + be64_to_cpu(esi_hdr->array_offset)); + + if (esi_buf_size < + be64_to_cpu(esi_hdr->array_offset) + + (num_attrs * sizeof(struct energy_scale_attribute))) { + goto out_free_esi_buf; + } + + papr_groups = kcalloc(num_attrs, sizeof(*papr_groups), GFP_KERNEL); + if (!papr_groups) + goto out_free_esi_buf; + + papr_kobj = kobject_create_and_add("papr", firmware_kobj); + if (!papr_kobj) { + pr_warn("kobject_create_and_add papr failed\n"); + goto out_papr_groups; + } + + esi_kobj = kobject_create_and_add("energy_scale_info", papr_kobj); + if (!esi_kobj) { + pr_warn("kobject_create_and_add energy_scale_info failed\n"); + goto out_kobj; + } + + /* Allocate the groups before registering */ + for (idx = 0; idx < num_attrs; idx++) { + papr_groups[idx].pg.attrs = kcalloc(KOBJ_MAX_ATTRS + 1, + sizeof(*papr_groups[idx].pg.attrs), + GFP_KERNEL); + if (!papr_groups[idx].pg.attrs) + goto out_pgattrs; + + papr_groups[idx].pg.name = kasprintf(GFP_KERNEL, "%lld", + be64_to_cpu(esi_attrs[idx].id)); + if (papr_groups[idx].pg.name == NULL) + goto out_pgattrs; + } + + for (idx = 0; idx < num_attrs; idx++) { + bool show_val_desc = true; + + /* Do not add the value desc attr if it does not exist */ + if (strnlen(esi_attrs[idx].value_desc, + sizeof(esi_attrs[idx].value_desc)) == 0) + show_val_desc = false; + + if (add_attr_group(be64_to_cpu(esi_attrs[idx].id), + &papr_groups[idx], + show_val_desc)) { + pr_warn("Failed to create papr attribute group %s\n", + papr_groups[idx].pg.name); + idx = num_attrs; + goto out_pgattrs; + } + } + + kfree(esi_buf); + return 0; +out_pgattrs: + for (i = 0; i < idx ; i++) { + kfree(papr_groups[i].pg.attrs); + kfree(papr_groups[i].pg.name); + } + kobject_put(esi_kobj); +out_kobj: + kobject_put(papr_kobj); +out_papr_groups: + kfree(papr_groups); +out_free_esi_buf: + kfree(esi_buf); + + return -ENOMEM; +} + +machine_device_initcall(pseries, papr_init); -- cgit v1.2.3 From e86debbbb5f89c2575110cfdce89d1820577aa94 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 4 Mar 2022 18:04:02 +0100 Subject: powerpc: Cleanup asm-prototypes.c Last call to sys_swapcontext() from ASM was removed by commit fbcee2ebe8ed ("powerpc/32: Always save non volatile GPRs at syscall entry") sys_debug_setcontext() prototype not needed anymore since commit f3675644e172 ("powerpc/syscalls: signal_{32, 64} - switch to SYSCALL_DEFINE") sys_switch_endian() prototype not needed anymore since commit 81dac8177862 ("powerpc/64: Make sys_switch_endian() traceable") Signed-off-by: Christophe Leroy [mpe: Keep _mcount() prototype to avoid modpost errors] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3ed660a585df2080ea8412ec20fbf652f5bf013a.1646413435.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-prototypes.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 41b8a1e1144a..cca69093f03d 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -57,12 +57,7 @@ int enter_vmx_ops(void); void *exit_vmx_ops(void *dest); /* signals, syscalls and interrupts */ -long sys_swapcontext(struct ucontext __user *old_ctx, - struct ucontext __user *new_ctx, - long ctx_size); #ifdef CONFIG_PPC32 -long sys_debug_setcontext(struct ucontext __user *ctx, - int ndbg, struct sig_dbg_op __user *dbg); int ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp); @@ -81,7 +76,6 @@ unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs); long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low); -long sys_switch_endian(void); /* prom_init (OpenFirmware) */ unsigned long __init prom_init(unsigned long r3, unsigned long r4, -- cgit v1.2.3 From e15c703be48edc3b2f96b66d4f548dc88b44266c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 4 Mar 2022 18:04:03 +0100 Subject: powerpc/smp: Declare current_set static current_set extern not needed anymore since commit eafd825ed710 ("powerpc/64: Simplify __secondary_start paca->kstack handling") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a55eb65c9d7319f0af3c31e3f6ba36522f10003d.1646413435.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-prototypes.h | 1 - arch/powerpc/kernel/smp.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index cca69093f03d..ce4e355c96a2 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -20,7 +20,6 @@ #include /* SMP */ -extern struct task_struct *current_set[NR_CPUS]; extern struct task_struct *secondary_current; void start_secondary(void *unused); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index b7fd6a72aa76..7e30a6fe5adf 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -716,7 +716,7 @@ void smp_send_stop(void) } #endif /* CONFIG_NMI_IPI */ -struct task_struct *current_set[NR_CPUS]; +static struct task_struct *current_set[NR_CPUS]; static void smp_store_cpu_info(int id) { -- cgit v1.2.3 From a4abd55a2490fd6407ddb6810e41f64ebd66d3af Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 4 Mar 2022 18:04:04 +0100 Subject: powerpc/kexec: Declare kexec_paca static kexec_paca is exclusively used in kexec/core_64.c Declare it static. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/094983ee851644165b7700c73cac63cfe20596cd.1646413435.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-prototypes.h | 2 -- arch/powerpc/kexec/core_64.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index ce4e355c96a2..eede369ba94f 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -24,9 +24,7 @@ extern struct task_struct *secondary_current; void start_secondary(void *unused); /* kexec */ -struct paca_struct; struct kimage; -extern struct paca_struct kexec_paca; void kexec_copy_flush(struct kimage *image); /* pseries hcall tracing */ diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 635b5fc30b53..2d49dce129f2 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -291,7 +291,7 @@ static union thread_union kexec_stack __init_task_data = * For similar reasons to the stack above, the kexecing CPU needs to be on a * static PACA; we switch to kexec_paca. */ -struct paca_struct kexec_paca; +static struct paca_struct kexec_paca; /* Our assembly helper, in misc_64.S */ extern void kexec_sequence(void *newstack, unsigned long start, -- cgit v1.2.3 From f771b55731fc82b1e8e9ef123f6f1b8d8c92bc63 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 7 Mar 2022 13:26:25 +1100 Subject: KVM: PPC: Use KVM_CAP_PPC_AIL_MODE_3 Use KVM_CAP_PPC_AIL_MODE_3 to advertise the capability to set the AIL resource mode to 3 with the H_SET_MODE hypercall. This capability differs between processor types and KVM types (PR, HV, Nested HV), and affects guest-visible behaviour. QEMU will implement a cap-ail-mode-3 to control this behaviour[1], and use the KVM CAP if available to determine KVM support[2]. [1] https://lists.nongnu.org/archive/html/qemu-ppc/2022-02/msg00437.html [2] https://lists.nongnu.org/archive/html/qemu-ppc/2022-02/msg00439.html Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas [mpe: Rebase onto 93b71801a827 from kvm-ppc-cap-210 branch, add EXPORT_SYMBOL] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220222064727.2314380-4-npiggin@gmail.com --- arch/powerpc/include/asm/setup.h | 2 ++ arch/powerpc/kvm/powerpc.c | 17 +++++++++++++++++ arch/powerpc/platforms/pseries/setup.c | 13 ++++++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index d0d3dd531c7f..a555fb77258a 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -28,11 +28,13 @@ void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 #ifdef CONFIG_PPC_PSERIES +extern bool pseries_reloc_on_exception(void); extern bool pseries_enable_reloc_on_exc(void); extern void pseries_disable_reloc_on_exc(void); extern void pseries_big_endian_exceptions(void); void __init pseries_little_endian_exceptions(void); #else +static inline bool pseries_reloc_on_exception(void) { return false; } static inline bool pseries_enable_reloc_on_exc(void) { return false; } static inline void pseries_disable_reloc_on_exc(void) {} static inline void pseries_big_endian_exceptions(void) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9772b176e406..875c30c12db0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -705,6 +705,23 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; #endif + case KVM_CAP_PPC_AIL_MODE_3: + r = 0; + /* + * KVM PR, POWER7, and some POWER9s don't support AIL=3 mode. + * The POWER9s can support it if the guest runs in hash mode, + * but QEMU doesn't necessarily query the capability in time. + */ + if (hv_enabled) { + if (kvmhv_on_pseries()) { + if (pseries_reloc_on_exception()) + r = 1; + } else if (cpu_has_feature(CPU_FTR_ARCH_207S) && + !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { + r = 1; + } + } + break; default: r = 0; break; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 83a04d967a59..5bdbbe2151b1 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -353,6 +353,14 @@ static void pseries_lpar_idle(void) pseries_idle_epilog(); } +static bool pseries_reloc_on_exception_enabled; + +bool pseries_reloc_on_exception(void) +{ + return pseries_reloc_on_exception_enabled; +} +EXPORT_SYMBOL_GPL(pseries_reloc_on_exception); + /* * Enable relocation on during exceptions. This has partition wide scope and * may take a while to complete, if it takes longer than one second we will @@ -377,6 +385,7 @@ bool pseries_enable_reloc_on_exc(void) " on exceptions: %ld\n", rc); return false; } + pseries_reloc_on_exception_enabled = true; return true; } @@ -404,7 +413,9 @@ void pseries_disable_reloc_on_exc(void) break; mdelay(get_longbusy_msecs(rc)); } - if (rc != H_SUCCESS) + if (rc == H_SUCCESS) + pseries_reloc_on_exception_enabled = false; + else pr_warn("Warning: Failed to disable relocation on exceptions: %ld\n", rc); } -- cgit v1.2.3 From 76222808fc253cb9ea66c3e0e8d1946933f25b70 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 4 Mar 2022 18:04:05 +0100 Subject: powerpc: Move C prototypes out of asm-prototypes.h We originally added asm-prototypes.h in commit 42f5b4cacd78 ("powerpc: Introduce asm-prototypes.h"). It's purpose was for prototypes of C functions that are only called from asm, in order to fix sparse warnings about missing prototypes. A few months later Nick added a different use case in commit 4efca4ed05cb ("kbuild: modversions for EXPORT_SYMBOL() for asm") for C prototypes for exported asm functions. This is basically the inverse of our original usage. Since then we've added various prototypes to asm-prototypes.h for both reasons, meaning we now need to unstitch it all. Dispatch prototypes of C functions into relevant headers and keep only the prototypes for functions defined in assembly. For the time being, leave prom_init() there because moving it into asm/prom.h or asm/setup.h conflicts with drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowrom.o This will be fixed later by untaggling asm/pci.h and asm/prom.h or by renaming the function in shadowrom.c Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/62d46904eca74042097acf4cb12c175e3067f3d1.1646413435.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-prototypes.h | 49 ----------------------- arch/powerpc/include/asm/ftrace.h | 3 ++ arch/powerpc/include/asm/hvcall.h | 5 +++ arch/powerpc/include/asm/interrupt.h | 11 +++++ arch/powerpc/include/asm/kexec.h | 2 + arch/powerpc/include/asm/processor.h | 8 ++++ arch/powerpc/include/asm/setup.h | 7 ++++ arch/powerpc/include/asm/smp.h | 3 ++ arch/powerpc/include/asm/syscalls.h | 4 ++ arch/powerpc/kernel/early_32.c | 1 - arch/powerpc/kernel/interrupt.c | 1 - arch/powerpc/kernel/irq.c | 1 - arch/powerpc/kernel/mce.c | 1 - arch/powerpc/kernel/ptrace/ptrace.c | 1 - arch/powerpc/kernel/setup_64.c | 1 - arch/powerpc/kernel/smp.c | 1 - arch/powerpc/kernel/syscalls.c | 1 - arch/powerpc/kernel/tau_6xx.c | 1 - arch/powerpc/kernel/time.c | 1 - arch/powerpc/kernel/trace/ftrace.c | 1 - arch/powerpc/kexec/core_64.c | 1 - arch/powerpc/kvm/book3s_hv_builtin.c | 1 - arch/powerpc/kvm/book3s_hv_rm_xive.c | 1 - arch/powerpc/lib/vmx-helper.c | 1 - arch/powerpc/mm/book3s64/slb.c | 1 - arch/powerpc/mm/fault.c | 1 - arch/powerpc/platforms/powernv/idle.c | 1 - arch/powerpc/platforms/powernv/opal-tracepoints.c | 1 - arch/powerpc/platforms/pseries/lpar.c | 1 - 29 files changed, 43 insertions(+), 69 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index eede369ba94f..d995c65d18ab 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -19,19 +19,6 @@ #include -/* SMP */ -extern struct task_struct *secondary_current; -void start_secondary(void *unused); - -/* kexec */ -struct kimage; -void kexec_copy_flush(struct kimage *image); - -/* pseries hcall tracing */ -extern struct static_key hcall_tracepoint_key; -void __trace_hcall_entry(unsigned long opcode, unsigned long *args); -void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); - /* Ultravisor */ #if defined(CONFIG_PPC_POWERNV) || defined(CONFIG_PPC_SVM) long ucall_norets(unsigned long opcode, ...); @@ -47,43 +34,12 @@ int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode, uint64_t msr); -/* VMX copying */ -int enter_vmx_usercopy(void); -int exit_vmx_usercopy(void); -int enter_vmx_ops(void); -void *exit_vmx_ops(void *dest); - -/* signals, syscalls and interrupts */ -#ifdef CONFIG_PPC32 -int -ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, - struct __kernel_old_timeval __user *tvp); -unsigned long __init early_init(unsigned long dt_ptr); -void __init machine_init(u64 dt_ptr); -#endif -long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs); -notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs, long scv); -notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs); -notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs); -#ifdef CONFIG_PPC64 -unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *regs); -unsigned long interrupt_exit_user_restart(struct pt_regs *regs); -unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs); -#endif - -long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, - u32 len_high, u32 len_low); - /* prom_init (OpenFirmware) */ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long pp, unsigned long r6, unsigned long r7, unsigned long kbase); -/* setup */ -void __init early_setup(unsigned long dt_ptr); -void early_setup_secondary(void); - /* misc runtime */ extern u64 __bswapdi2(u64); extern s64 __lshrdi3(s64, int); @@ -94,11 +50,6 @@ extern int __ucmpdi2(u64, u64); /* tracing */ void _mcount(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, - unsigned long sp); - -void pnv_power9_force_smt4_catch(void); -void pnv_power9_force_smt4_release(void); /* Transaction memory related */ void tm_enable(void); diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index ff034ae4e472..d83758acd1c7 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -19,6 +19,9 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp); + struct dyn_arch_ftrace { struct module *mod; }; diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 48f510ba9f4a..d92a20a85395 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -501,6 +501,11 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...); long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...); long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...); +/* pseries hcall tracing */ +extern struct static_key hcall_tracepoint_key; +void __trace_hcall_entry(unsigned long opcode, unsigned long *args); +void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); + struct hvcall_mpp_data { unsigned long entitled_mem; unsigned long mapped_mem; diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index f3b2c93a5db1..f964ef5c57d8 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -636,6 +636,17 @@ static inline void interrupt_cond_local_irq_enable(struct pt_regs *regs) local_irq_enable(); } +long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, + unsigned long r0, struct pt_regs *regs); +notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs, long scv); +notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs); +notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs); +#ifdef CONFIG_PPC64 +unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *regs); +unsigned long interrupt_exit_user_restart(struct pt_regs *regs); +unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs); +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_INTERRUPT_H */ diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 8ebdd23d987c..2aefe14e1442 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -96,6 +96,8 @@ static inline bool kdump_in_progress(void) void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer, unsigned long start_address) __noreturn; +void kexec_copy_flush(struct kimage *image); + #ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops kexec_elf64_ops; diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 2c8686d9e964..39c25021030f 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -411,6 +411,8 @@ extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); extern void arch300_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask); +void pnv_power9_force_smt4_catch(void); +void pnv_power9_force_smt4_release(void); extern int fix_alignment(struct pt_regs *); @@ -427,6 +429,12 @@ extern int fix_alignment(struct pt_regs *); int do_mathemu(struct pt_regs *regs); +/* VMX copying */ +int enter_vmx_usercopy(void); +int exit_vmx_usercopy(void); +int enter_vmx_ops(void); +void *exit_vmx_ops(void *dest); + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PROCESSOR_H */ diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index d0d3dd531c7f..049ca26893e6 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -76,6 +76,13 @@ static inline void setup_spectre_v2(void) {} #endif void __init do_btb_flush_fixups(void); +#ifdef CONFIG_PPC32 +unsigned long __init early_init(unsigned long dt_ptr); +void __init machine_init(u64 dt_ptr); +#endif +void __init early_setup(unsigned long dt_ptr); +void early_setup_secondary(void); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 007332a4a732..60ab739a5e3b 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -60,6 +60,9 @@ struct smp_ops_t { #endif }; +extern struct task_struct *secondary_current; + +void start_secondary(void *unused); extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); extern void smp_send_debugger_break(void); diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 7ee66ae5444d..a2b13e55254f 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -18,6 +18,10 @@ asmlinkage long sys_mmap2(unsigned long addr, size_t len, unsigned long fd, unsigned long pgoff); asmlinkage long ppc64_personality(unsigned long personality); asmlinkage long sys_rtas(struct rtas_args __user *uargs); +int ppc_select(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct __kernel_old_timeval __user *tvp); +long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, + u32 len_high, u32 len_low); #ifdef CONFIG_COMPAT unsigned long compat_sys_mmap2(unsigned long addr, size_t len, diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c index ef2ad4945904..03f1135ef64f 100644 --- a/arch/powerpc/kernel/early_32.c +++ b/arch/powerpc/kernel/early_32.c @@ -8,7 +8,6 @@ #include #include #include -#include /* * We're called here very early in the boot. diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 7cd6ce3ec423..784ea3289c84 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -5,7 +5,6 @@ #include #include /* for show_regs */ -#include #include #include #include diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 2cf31a97126c..752fb182eacb 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 671d727d06f7..18173199b79d 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "setup.h" diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 1212a812a7ab..55742ef1f991 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -22,7 +22,6 @@ #include #include -#include #include #define CREATE_TRACE_POINTS diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index be8577ac9397..e547066a06aa 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -67,7 +67,6 @@ #include #include #include -#include #include "setup.h" diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 7e30a6fe5adf..de0f6f09a5dd 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 825931e400df..c4f5b4ce926f 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -35,7 +35,6 @@ #include #include #include -#include static inline long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 8e83d19fe8fa..828d0f4106d2 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "setup.h" diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 17598bba54eb..958e2929776f 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -69,7 +69,6 @@ #include #include #include -#include #include /* powerpc clocksource/clockevent code */ diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index f21b8fbd418e..4ee04aacf9f1 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 2d49dce129f2..6cc7793b8420 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7d6d91338c3f..7e52d0beee77 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c index 6f18632e30e9..dd9880731bd6 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xive.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index 62e6c3045252..f76a50291fd7 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -9,7 +9,6 @@ #include #include #include -#include int enter_vmx_usercopy(void) { diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 31f4cef3adac..81091b9587f6 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -9,7 +9,6 @@ * Copyright (C) 2002 Anton Blanchard , IBM */ -#include #include #include #include diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 7ba6d3eff636..d53fed4eccbd 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 9942289f379b..a6677a111aca 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c index f16a43540e30..91b36541b9e5 100644 --- a/arch/powerpc/platforms/powernv/opal-tracepoints.c +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -2,7 +2,6 @@ #include #include #include -#include #ifdef CONFIG_JUMP_LABEL struct static_key opal_tracepoint_key = STATIC_KEY_INIT; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f8899d506ea4..760581c5752f 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include "pseries.h" -- cgit v1.2.3 From 6b3a3e12f8e6eea47428bb39aaf58832b50bb379 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Tue, 8 Mar 2022 10:14:14 +1100 Subject: powerpc: declare unmodified attribute_group usages const Inspired by (bd75b4ef4977: Constify static attribute_group structs), accepted by linux-next, reported: https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20220210202805.7750-4-rikard.falkeborn@gmail.com/ Nearly all singletons of type struct attribute_group are never modified, and so are candidates for being const. Declare them as const. Signed-off-by: Rohan McLure Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220307231414.86560-1-rmclure@linux.ibm.com --- arch/powerpc/include/asm/spu.h | 4 ++-- arch/powerpc/perf/generic-compat-pmu.c | 4 ++-- arch/powerpc/perf/hv-24x7.c | 6 +++--- arch/powerpc/perf/hv-gpci.c | 8 ++++---- arch/powerpc/perf/imc-pmu.c | 6 +++--- arch/powerpc/perf/isa207-common.c | 2 +- arch/powerpc/perf/power10-pmu.c | 6 +++--- arch/powerpc/perf/power7-pmu.c | 4 ++-- arch/powerpc/perf/power8-pmu.c | 4 ++-- arch/powerpc/perf/power9-pmu.c | 6 +++--- arch/powerpc/platforms/cell/cbe_thermal.c | 2 +- arch/powerpc/platforms/cell/spu_base.c | 4 ++-- arch/powerpc/platforms/powernv/opal-core.c | 2 +- arch/powerpc/platforms/powernv/opal-dump.c | 2 +- arch/powerpc/platforms/powernv/opal-flash.c | 2 +- arch/powerpc/platforms/pseries/papr_scm.c | 2 +- arch/powerpc/platforms/pseries/power.c | 2 +- 17 files changed, 33 insertions(+), 33 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/spu.h b/arch/powerpc/include/asm/spu.h index 8a2d11ba0dae..96ad4510c895 100644 --- a/arch/powerpc/include/asm/spu.h +++ b/arch/powerpc/include/asm/spu.h @@ -249,8 +249,8 @@ void unregister_spu_syscalls(struct spufs_calls *calls); int spu_add_dev_attr(struct device_attribute *attr); void spu_remove_dev_attr(struct device_attribute *attr); -int spu_add_dev_attr_group(struct attribute_group *attrs); -void spu_remove_dev_attr_group(struct attribute_group *attrs); +int spu_add_dev_attr_group(const struct attribute_group *attrs); +void spu_remove_dev_attr_group(const struct attribute_group *attrs); extern void notify_spus_active(void); extern void do_notify_spus_active(void); diff --git a/arch/powerpc/perf/generic-compat-pmu.c b/arch/powerpc/perf/generic-compat-pmu.c index b6e25f75109d..f3db88aee4dd 100644 --- a/arch/powerpc/perf/generic-compat-pmu.c +++ b/arch/powerpc/perf/generic-compat-pmu.c @@ -130,7 +130,7 @@ static struct attribute *generic_compat_events_attr[] = { NULL }; -static struct attribute_group generic_compat_pmu_events_group = { +static const struct attribute_group generic_compat_pmu_events_group = { .name = "events", .attrs = generic_compat_events_attr, }; @@ -146,7 +146,7 @@ static struct attribute *generic_compat_pmu_format_attr[] = { NULL, }; -static struct attribute_group generic_compat_pmu_format_group = { +static const struct attribute_group generic_compat_pmu_format_group = { .name = "format", .attrs = generic_compat_pmu_format_attr, }; diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 1e8aa934e37e..12c1777187fc 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -204,7 +204,7 @@ static struct attribute *format_attrs[] = { NULL, }; -static struct attribute_group format_group = { +static const struct attribute_group format_group = { .name = "format", .attrs = format_attrs, }; @@ -1148,7 +1148,7 @@ static struct attribute *cpumask_attrs[] = { NULL, }; -static struct attribute_group cpumask_attr_group = { +static const struct attribute_group cpumask_attr_group = { .attrs = cpumask_attrs, }; @@ -1162,7 +1162,7 @@ static struct attribute *if_attrs[] = { NULL, }; -static struct attribute_group if_group = { +static const struct attribute_group if_group = { .name = "interface", .bin_attrs = if_bin_attrs, .attrs = if_attrs, diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index c756228a081f..5eb60ed5b5e8 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -65,12 +65,12 @@ static struct attribute *format_attrs[] = { NULL, }; -static struct attribute_group format_group = { +static const struct attribute_group format_group = { .name = "format", .attrs = format_attrs, }; -static struct attribute_group event_group = { +static const struct attribute_group event_group = { .name = "events", .attrs = hv_gpci_event_attrs, }; @@ -126,11 +126,11 @@ static struct attribute *cpumask_attrs[] = { NULL, }; -static struct attribute_group cpumask_attr_group = { +static const struct attribute_group cpumask_attr_group = { .attrs = cpumask_attrs, }; -static struct attribute_group interface_group = { +static const struct attribute_group interface_group = { .name = "interface", .attrs = interface_attrs, }; diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index e7583fbcc8fa..526d4b767534 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -71,7 +71,7 @@ static struct attribute *imc_format_attrs[] = { NULL, }; -static struct attribute_group imc_format_group = { +static const struct attribute_group imc_format_group = { .name = "format", .attrs = imc_format_attrs, }; @@ -90,7 +90,7 @@ static struct attribute *trace_imc_format_attrs[] = { NULL, }; -static struct attribute_group trace_imc_format_group = { +static const struct attribute_group trace_imc_format_group = { .name = "format", .attrs = trace_imc_format_attrs, }; @@ -125,7 +125,7 @@ static struct attribute *imc_pmu_cpumask_attrs[] = { NULL, }; -static struct attribute_group imc_pmu_cpumask_attr_group = { +static const struct attribute_group imc_pmu_cpumask_attr_group = { .attrs = imc_pmu_cpumask_attrs, }; diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 4037ea652522..a74d382ecbb7 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -37,7 +37,7 @@ static struct attribute *isa207_pmu_format_attr[] = { NULL, }; -struct attribute_group isa207_pmu_format_group = { +const struct attribute_group isa207_pmu_format_group = { .name = "format", .attrs = isa207_pmu_format_attr, }; diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c index 0975ad0b42c4..d3398100a60f 100644 --- a/arch/powerpc/perf/power10-pmu.c +++ b/arch/powerpc/perf/power10-pmu.c @@ -200,12 +200,12 @@ static struct attribute *power10_events_attr[] = { NULL }; -static struct attribute_group power10_pmu_events_group_dd1 = { +static const struct attribute_group power10_pmu_events_group_dd1 = { .name = "events", .attrs = power10_events_attr_dd1, }; -static struct attribute_group power10_pmu_events_group = { +static const struct attribute_group power10_pmu_events_group = { .name = "events", .attrs = power10_events_attr, }; @@ -253,7 +253,7 @@ static struct attribute *power10_pmu_format_attr[] = { NULL, }; -static struct attribute_group power10_pmu_format_group = { +static const struct attribute_group power10_pmu_format_group = { .name = "format", .attrs = power10_pmu_format_attr, }; diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index 99b5ba314ea7..a74211410b8d 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -405,7 +405,7 @@ static struct attribute *power7_events_attr[] = { NULL }; -static struct attribute_group power7_pmu_events_group = { +static const struct attribute_group power7_pmu_events_group = { .name = "events", .attrs = power7_events_attr, }; @@ -417,7 +417,7 @@ static struct attribute *power7_pmu_format_attr[] = { NULL, }; -static struct attribute_group power7_pmu_format_group = { +static const struct attribute_group power7_pmu_format_group = { .name = "format", .attrs = power7_pmu_format_attr, }; diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index f21194b5604a..e37b1e714d2b 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -92,7 +92,7 @@ enum { */ /* PowerISA v2.07 format attribute structure*/ -extern struct attribute_group isa207_pmu_format_group; +extern const struct attribute_group isa207_pmu_format_group; /* Table of alternatives, sorted by column 0 */ static const unsigned int event_alternatives[][MAX_ALT] = { @@ -182,7 +182,7 @@ static struct attribute *power8_events_attr[] = { NULL }; -static struct attribute_group power8_pmu_events_group = { +static const struct attribute_group power8_pmu_events_group = { .name = "events", .attrs = power8_events_attr, }; diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 4b7c17e36100..c9eb5232e68b 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -96,7 +96,7 @@ extern u64 PERF_REG_EXTENDED_MASK; #define PVR_POWER9_CUMULUS 0x00002000 /* PowerISA v2.07 format attribute structure*/ -extern struct attribute_group isa207_pmu_format_group; +extern const struct attribute_group isa207_pmu_format_group; int p9_dd21_bl_ev[] = { PM_MRK_ST_DONE_L2, @@ -217,7 +217,7 @@ static struct attribute *power9_events_attr[] = { NULL }; -static struct attribute_group power9_pmu_events_group = { +static const struct attribute_group power9_pmu_events_group = { .name = "events", .attrs = power9_events_attr, }; @@ -253,7 +253,7 @@ static struct attribute *power9_pmu_format_attr[] = { NULL, }; -static struct attribute_group power9_pmu_format_group = { +static const struct attribute_group power9_pmu_format_group = { .name = "format", .attrs = power9_pmu_format_attr, }; diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c index 2ece77f49bc3..abb5e527b4db 100644 --- a/arch/powerpc/platforms/cell/cbe_thermal.c +++ b/arch/powerpc/platforms/cell/cbe_thermal.c @@ -255,7 +255,7 @@ static struct attribute *spu_attributes[] = { NULL, }; -static struct attribute_group spu_attribute_group = { +static const struct attribute_group spu_attribute_group = { .name = "thermal", .attrs = spu_attributes, }; diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 83cea9e7ee72..2eecba3345c3 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -490,7 +490,7 @@ int spu_add_dev_attr(struct device_attribute *attr) } EXPORT_SYMBOL_GPL(spu_add_dev_attr); -int spu_add_dev_attr_group(struct attribute_group *attrs) +int spu_add_dev_attr_group(const struct attribute_group *attrs) { struct spu *spu; int rc = 0; @@ -529,7 +529,7 @@ void spu_remove_dev_attr(struct device_attribute *attr) } EXPORT_SYMBOL_GPL(spu_remove_dev_attr); -void spu_remove_dev_attr_group(struct attribute_group *attrs) +void spu_remove_dev_attr_group(const struct attribute_group *attrs) { struct spu *spu; diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 0331f1973f0e..b97bc179f65a 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -603,7 +603,7 @@ static struct bin_attribute *mpipl_bin_attr[] = { }; -static struct attribute_group mpipl_group = { +static const struct attribute_group mpipl_group = { .attrs = mpipl_attr, .bin_attrs = mpipl_bin_attr, }; diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c index 410ed5b9de29..16c5860f1372 100644 --- a/arch/powerpc/platforms/powernv/opal-dump.c +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -150,7 +150,7 @@ static struct attribute *initiate_attrs[] = { NULL, }; -static struct attribute_group initiate_attr_group = { +static const struct attribute_group initiate_attr_group = { .attrs = initiate_attrs, }; diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c index 7e7d38b17420..18481a8c52fa 100644 --- a/arch/powerpc/platforms/powernv/opal-flash.c +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -512,7 +512,7 @@ static struct attribute *image_op_attrs[] = { NULL /* need to NULL terminate the list of attributes */ }; -static struct attribute_group image_op_attr_group = { +static const struct attribute_group image_op_attr_group = { .attrs = image_op_attrs, }; diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 20aafd387840..1238b94b3cc1 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -1039,7 +1039,7 @@ static struct attribute *papr_nd_attributes[] = { NULL, }; -static struct attribute_group papr_nd_attribute_group = { +static const struct attribute_group papr_nd_attribute_group = { .name = "papr", .is_visible = papr_nd_attribute_visible, .attrs = papr_nd_attributes, diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index ee343ec6ab94..3676cb297767 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -51,7 +51,7 @@ static struct attribute *g[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit v1.2.3 From 0b0057cc4193c7cd9c0829a440e4901b29ce4ff8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 11 Feb 2022 09:51:32 +0100 Subject: powerpc/bitops: Force inlining of fls() Building a kernel with CONFIG_CC_OPTIMISE_FOR_SIZE leads to the following functions being copied several times in vmlinux: 31 times __ilog2_u32() 34 times fls() Disassembly follows: c00f476c : c00f476c: 7c 63 00 34 cntlzw r3,r3 c00f4770: 20 63 00 20 subfic r3,r3,32 c00f4774: 4e 80 00 20 blr c00f4778 <__ilog2_u32>: c00f4778: 94 21 ff f0 stwu r1,-16(r1) c00f477c: 7c 08 02 a6 mflr r0 c00f4780: 90 01 00 14 stw r0,20(r1) c00f4784: 4b ff ff e9 bl c00f476c c00f4788: 80 01 00 14 lwz r0,20(r1) c00f478c: 38 63 ff ff addi r3,r3,-1 c00f4790: 7c 08 03 a6 mtlr r0 c00f4794: 38 21 00 10 addi r1,r1,16 c00f4798: 4e 80 00 20 blr When forcing inlining of fls(), we get c0008b80 <__ilog2_u32>: c0008b80: 7c 63 00 34 cntlzw r3,r3 c0008b84: 20 63 00 1f subfic r3,r3,31 c0008b88: 4e 80 00 20 blr vmlinux size gets reduced by 1 kbyte with that change. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/adc9c9d6378f6b5008246ca717993d7870188efb.1644569473.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index ea5d27dda8cf..344fba3b16eb 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -287,7 +287,7 @@ static inline void arch___clear_bit_unlock(int nr, volatile unsigned long *addr) * fls: find last (most-significant) bit set. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int x) +static __always_inline int fls(unsigned int x) { int lz; @@ -305,7 +305,7 @@ static inline int fls(unsigned int x) * 32-bit fls calls. */ #ifdef CONFIG_PPC64 -static inline int fls64(__u64 x) +static __always_inline int fls64(__u64 x) { int lz; -- cgit v1.2.3 From 792993919349fefba20f58ae4843c80e8b01f518 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 11 Feb 2022 15:16:51 +0100 Subject: powerpc/64: Force inlining of prevent_user_access() and set_kuap() A ppc64_defconfig build exhibits about 10 copied of prevent_user_access(). It also have one copy of set_kuap(). c000000000017340 <.prevent_user_access.constprop.0>: c00000000001a038: 4b ff d3 09 bl c000000000017340 <.prevent_user_access.constprop.0> c00000000001aabc: 4b ff c8 85 bl c000000000017340 <.prevent_user_access.constprop.0> c00000000001ab38: 4b ff c8 09 bl c000000000017340 <.prevent_user_access.constprop.0> c00000000001ade0: 4b ff c5 61 bl c000000000017340 <.prevent_user_access.constprop.0> c000000000039b90 <.prevent_user_access.constprop.0>: c00000000003ac08: 4b ff ef 89 bl c000000000039b90 <.prevent_user_access.constprop.0> c00000000003b9d0: 4b ff e1 c1 bl c000000000039b90 <.prevent_user_access.constprop.0> c00000000003ba54: 4b ff e1 3d bl c000000000039b90 <.prevent_user_access.constprop.0> c00000000003bbfc: 4b ff df 95 bl c000000000039b90 <.prevent_user_access.constprop.0> c00000000015dde0 <.prevent_user_access.constprop.0>: c0000000001612c0: 4b ff cb 21 bl c00000000015dde0 <.prevent_user_access.constprop.0> c000000000161b54: 4b ff c2 8d bl c00000000015dde0 <.prevent_user_access.constprop.0> c000000000188cf0 <.prevent_user_access.constprop.0>: c00000000018d658: 4b ff b6 99 bl c000000000188cf0 <.prevent_user_access.constprop.0> c00000000030fe20 <.prevent_user_access.constprop.0>: c0000000003123d4: 4b ff da 4d bl c00000000030fe20 <.prevent_user_access.constprop.0> c000000000313970: 4b ff c4 b1 bl c00000000030fe20 <.prevent_user_access.constprop.0> c0000000005e6bd0 <.prevent_user_access.constprop.0>: c0000000005e7d8c: 4b ff ee 45 bl c0000000005e6bd0 <.prevent_user_access.constprop.0> c0000000007bcae0 <.prevent_user_access.constprop.0>: c0000000007bda10: 4b ff f0 d1 bl c0000000007bcae0 <.prevent_user_access.constprop.0> c0000000007bda54: 4b ff f0 8d bl c0000000007bcae0 <.prevent_user_access.constprop.0> c0000000007bdd28: 4b ff ed b9 bl c0000000007bcae0 <.prevent_user_access.constprop.0> c0000000007c0390: 4b ff c7 51 bl c0000000007bcae0 <.prevent_user_access.constprop.0> c00000000094e4f0 <.prevent_user_access.constprop.0>: c000000000950e40: 4b ff d6 b1 bl c00000000094e4f0 <.prevent_user_access.constprop.0> c00000000097d2d0 <.prevent_user_access.constprop.0>: c0000000009813fc: 4b ff be d5 bl c00000000097d2d0 <.prevent_user_access.constprop.0> c000000000acd540 <.prevent_user_access.constprop.0>: c000000000ad1d60: 4b ff b7 e1 bl c000000000acd540 <.prevent_user_access.constprop.0> c000000000e5d680 <.prevent_user_access.constprop.0>: c000000000e64b60: 4b ff 8b 21 bl c000000000e5d680 <.prevent_user_access.constprop.0> c000000000e64b6c: 4b ff 8b 15 bl c000000000e5d680 <.prevent_user_access.constprop.0> c000000000e64c38: 4b ff 8a 49 bl c000000000e5d680 <.prevent_user_access.constprop.0> When building signal_64.c with -Winline the following messages appear: ./arch/powerpc/include/asm/book3s/64/kup.h:331:20: error: inlining failed in call to 'set_kuap': call is unlikely and code size would grow [-Werror=inline] ./arch/powerpc/include/asm/book3s/64/kup.h:401:20: error: inlining failed in call to 'prevent_user_access.constprop': call is unlikely and code size would grow [-Werror=inline] Those functions are used on hot pathes and have been expected to be inlined at all time. Force them inline. This patch reduces the kernel text size by 700 bytes, confirming that not inlining those functions is not worth it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/eff9b2b211957fa2e8707e46f31674097fd563a3.1644588972.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/kup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 69fcf63eec94..54cf46808157 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -328,7 +328,7 @@ static inline unsigned long get_kuap(void) return mfspr(SPRN_AMR); } -static inline void set_kuap(unsigned long value) +static __always_inline void set_kuap(unsigned long value) { if (!mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) return; @@ -398,7 +398,7 @@ static __always_inline void allow_user_access(void __user *to, const void __user #endif /* !CONFIG_PPC_KUAP */ -static inline void prevent_user_access(unsigned long dir) +static __always_inline void prevent_user_access(unsigned long dir) { set_kuap(AMR_KUAP_BLOCKED); if (static_branch_unlikely(&uaccess_flush_key)) -- cgit v1.2.3 From 4c08d4bbc089a95f3f38389c2b79dbc6ab24f10b Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 25 Feb 2022 20:00:23 +0530 Subject: powerpc/papr_scm: Add perf interface support Performance monitoring support for papr-scm nvdimm devices via perf interface is added which includes addition of pmu functions like add/del/read/event_init for nvdimm_pmu struture. A new parameter 'priv' in added to the pdev_archdata structure to save nvdimm_pmu device pointer, to handle the unregistering of pmu device. papr_scm_pmu_register function populates the nvdimm_pmu structure with name, capabilities, cpumask along with event handling functions. Finally the populated nvdimm_pmu structure is passed to register the pmu device. Event handling functions internally uses hcall to get events and counter data. Result in power9 machine with 2 nvdimm device: Ex: List all event by perf list command:# perf list nmem nmem0/cache_rh_cnt/ [Kernel PMU event] nmem0/cache_wh_cnt/ [Kernel PMU event] nmem0/cri_res_util/ [Kernel PMU event] nmem0/ctl_res_cnt/ [Kernel PMU event] nmem0/ctl_res_tm/ [Kernel PMU event] nmem0/fast_w_cnt/ [Kernel PMU event] nmem0/host_l_cnt/ [Kernel PMU event] nmem0/host_l_dur/ [Kernel PMU event] nmem0/host_s_cnt/ [Kernel PMU event] nmem0/host_s_dur/ [Kernel PMU event] nmem0/med_r_cnt/ [Kernel PMU event] nmem0/med_r_dur/ [Kernel PMU event] nmem0/med_w_cnt/ [Kernel PMU event] nmem0/med_w_dur/ [Kernel PMU event] nmem0/mem_life/ [Kernel PMU event] nmem0/poweron_secs/ [Kernel PMU event] ... nmem1/mem_life/ [Kernel PMU event] nmem1/poweron_secs/ [Kernel PMU event] Acked-by: Peter Zijlstra (Intel) Tested-by: Nageswara R Sastry Signed-off-by: Kajol Jain [Add numa_map_to_online_node function call to get online node id] Reported-by: Nageswara R Sastry Reviewed-by: Madhavan Srinivasan Link: https://lore.kernel.org/r/20220225143024.47947-4-kjain@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/include/asm/device.h | 5 + arch/powerpc/platforms/pseries/papr_scm.c | 225 ++++++++++++++++++++++++++++++ 2 files changed, 230 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 219559d65864..47ed639f3b8f 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -48,6 +48,11 @@ struct dev_archdata { struct pdev_archdata { u64 dma_mask; + /* + * Pointer to nvdimm_pmu structure, to handle the unregistering + * of pmu device + */ + void *priv; }; #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f48e87ac89c9..4dd513d7c029 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -19,6 +19,7 @@ #include #include #include +#include #define BIND_ANY_ADDR (~0ul) @@ -68,6 +69,8 @@ #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS) #define PAPR_SCM_PERF_STATS_VERSION 0x1 +#define to_nvdimm_pmu(_pmu) container_of(_pmu, struct nvdimm_pmu, pmu) + /* Struct holding a single performance metric */ struct papr_scm_perf_stat { u8 stat_id[8]; @@ -120,6 +123,9 @@ struct papr_scm_priv { /* length of the stat buffer as expected by phyp */ size_t stat_buffer_len; + + /* array to have event_code and stat_id mappings */ + char **nvdimm_events_map; }; static int papr_scm_pmem_flush(struct nd_region *nd_region, @@ -340,6 +346,218 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p, return 0; } +static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, u64 *count) +{ + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + struct papr_scm_priv *p = (struct papr_scm_priv *)dev->driver_data; + int rc, size; + + /* Allocate request buffer enough to hold single performance stat */ + size = sizeof(struct papr_scm_perf_stats) + + sizeof(struct papr_scm_perf_stat); + + if (!p || !p->nvdimm_events_map) + return -EINVAL; + + stats = kzalloc(size, GFP_KERNEL); + if (!stats) + return -ENOMEM; + + stat = &stats->scm_statistic[0]; + memcpy(&stat->stat_id, + p->nvdimm_events_map[event->attr.config], + sizeof(stat->stat_id)); + stat->stat_val = 0; + + rc = drc_pmem_query_stats(p, stats, 1); + if (rc < 0) { + kfree(stats); + return rc; + } + + *count = be64_to_cpu(stat->stat_val); + kfree(stats); + return 0; +} + +static int papr_scm_pmu_event_init(struct perf_event *event) +{ + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu); + struct papr_scm_priv *p; + + if (!nd_pmu) + return -EINVAL; + + /* test the event attr type for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* it does not support event sampling mode */ + if (is_sampling_event(event)) + return -EOPNOTSUPP; + + /* no branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + p = (struct papr_scm_priv *)nd_pmu->dev->driver_data; + if (!p) + return -EINVAL; + + /* Invalid eventcode */ + if (event->attr.config == 0 || event->attr.config > 16) + return -EINVAL; + + return 0; +} + +static int papr_scm_pmu_add(struct perf_event *event, int flags) +{ + u64 count; + int rc; + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu); + + if (!nd_pmu) + return -EINVAL; + + if (flags & PERF_EF_START) { + rc = papr_scm_pmu_get_value(event, nd_pmu->dev, &count); + if (rc) + return rc; + + local64_set(&event->hw.prev_count, count); + } + + return 0; +} + +static void papr_scm_pmu_read(struct perf_event *event) +{ + u64 prev, now; + int rc; + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu); + + if (!nd_pmu) + return; + + rc = papr_scm_pmu_get_value(event, nd_pmu->dev, &now); + if (rc) + return; + + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); +} + +static void papr_scm_pmu_del(struct perf_event *event, int flags) +{ + papr_scm_pmu_read(event); +} + +static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu *nd_pmu) +{ + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + char *statid; + int index, rc, count; + u32 available_events; + + if (!p->stat_buffer_len) + return -ENOENT; + + available_events = (p->stat_buffer_len - sizeof(struct papr_scm_perf_stats)) + / sizeof(struct papr_scm_perf_stat); + + /* Allocate the buffer for phyp where stats are written */ + stats = kzalloc(p->stat_buffer_len, GFP_KERNEL); + if (!stats) { + rc = -ENOMEM; + return rc; + } + + /* Allocate memory to nvdimm_event_map */ + p->nvdimm_events_map = kcalloc(available_events, sizeof(char *), GFP_KERNEL); + if (!p->nvdimm_events_map) { + rc = -ENOMEM; + goto out_stats; + } + + /* Called to get list of events supported */ + rc = drc_pmem_query_stats(p, stats, 0); + if (rc) + goto out_nvdimm_events_map; + + for (index = 0, stat = stats->scm_statistic, count = 0; + index < available_events; index++, ++stat) { + statid = kzalloc(strlen(stat->stat_id) + 1, GFP_KERNEL); + if (!statid) { + rc = -ENOMEM; + goto out_nvdimm_events_map; + } + + strcpy(statid, stat->stat_id); + p->nvdimm_events_map[count] = statid; + count++; + } + p->nvdimm_events_map[count] = NULL; + kfree(stats); + return 0; + +out_nvdimm_events_map: + kfree(p->nvdimm_events_map); +out_stats: + kfree(stats); + return rc; +} + +static void papr_scm_pmu_register(struct papr_scm_priv *p) +{ + struct nvdimm_pmu *nd_pmu; + int rc, nodeid; + + nd_pmu = kzalloc(sizeof(*nd_pmu), GFP_KERNEL); + if (!nd_pmu) { + rc = -ENOMEM; + goto pmu_err_print; + } + + rc = papr_scm_pmu_check_events(p, nd_pmu); + if (rc) + goto pmu_check_events_err; + + nd_pmu->pmu.task_ctx_nr = perf_invalid_context; + nd_pmu->pmu.name = nvdimm_name(p->nvdimm); + nd_pmu->pmu.event_init = papr_scm_pmu_event_init; + nd_pmu->pmu.read = papr_scm_pmu_read; + nd_pmu->pmu.add = papr_scm_pmu_add; + nd_pmu->pmu.del = papr_scm_pmu_del; + + nd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_INTERRUPT | + PERF_PMU_CAP_NO_EXCLUDE; + + /*updating the cpumask variable */ + nodeid = numa_map_to_online_node(dev_to_node(&p->pdev->dev)); + nd_pmu->arch_cpumask = *cpumask_of_node(nodeid); + + rc = register_nvdimm_pmu(nd_pmu, p->pdev); + if (rc) + goto pmu_register_err; + + /* + * Set archdata.priv value to nvdimm_pmu structure, to handle the + * unregistering of pmu device. + */ + p->pdev->archdata.priv = nd_pmu; + return; + +pmu_register_err: + kfree(p->nvdimm_events_map); +pmu_check_events_err: + kfree(nd_pmu); +pmu_err_print: + dev_info(&p->pdev->dev, "nvdimm pmu didn't register rc=%d\n", rc); +} + /* * Issue hcall to retrieve dimm health info and populate papr_scm_priv with the * health information. @@ -1236,6 +1454,7 @@ static int papr_scm_probe(struct platform_device *pdev) goto err2; platform_set_drvdata(pdev, p); + papr_scm_pmu_register(p); return 0; @@ -1254,6 +1473,12 @@ static int papr_scm_remove(struct platform_device *pdev) nvdimm_bus_unregister(p->bus); drc_pmem_unbind(p); + + if (pdev->archdata.priv) + unregister_nvdimm_pmu(pdev->archdata.priv); + + pdev->archdata.priv = NULL; + kfree(p->nvdimm_events_map); kfree(p->bus_desc.provider_name); kfree(p); -- cgit v1.2.3 From 3af722cb735d212554027ec81e2aa2e6bf1ee34d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 17:12:10 +0100 Subject: powerpc/net: Implement powerpc specific csum_shift() to remove branch Today's implementation of csum_shift() leads to branching based on parity of 'offset' 000002f8 : 2f8: 70 a5 00 01 andi. r5,r5,1 2fc: 41 a2 00 08 beq 304 300: 54 84 c0 3e rotlwi r4,r4,24 304: 7c 63 20 14 addc r3,r3,r4 308: 7c 63 01 94 addze r3,r3 30c: 4e 80 00 20 blr Use first bit of 'offset' directly as input of the rotation instead of branching. 000002f8 : 2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28 2fc: 20 a5 00 20 subfic r5,r5,32 300: 5c 84 28 3e rotlw r4,r4,r5 304: 7c 63 20 14 addc r3,r3,r4 308: 7c 63 01 94 addze r3,r3 30c: 4e 80 00 20 blr And change to left shift instead of right shift to skip one more instruction. This has no impact on the final sum. 000002f8 : 2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28 2fc: 5c 84 28 3e rotlw r4,r4,r5 300: 7c 63 20 14 addc r3,r3,r4 304: 7c 63 01 94 addze r3,r3 308: 4e 80 00 20 blr Seems like only powerpc benefits from a branchless implementation. Other main architectures like ARM or X86 get better code with the generic implementation and its branch. Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- arch/powerpc/include/asm/checksum.h | 7 +++++++ include/net/checksum.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h index 350de8f90250..ab3832b93f0a 100644 --- a/arch/powerpc/include/asm/checksum.h +++ b/arch/powerpc/include/asm/checksum.h @@ -112,6 +112,13 @@ static __always_inline __wsum csum_add(__wsum csum, __wsum addend) #endif } +#define HAVE_ARCH_CSUM_SHIFT +static __always_inline __wsum csum_shift(__wsum sum, int offset) +{ + /* rotate sum to align it with a 16b boundary */ + return (__force __wsum)rol32((__force u32)sum, (offset & 1) << 3); +} + /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. ihl is the number diff --git a/include/net/checksum.h b/include/net/checksum.h index 79c67f14c448..6bc783b7a06c 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -80,6 +80,7 @@ static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) return csum16_add(csum, ~addend); } +#ifndef HAVE_ARCH_CSUM_SHIFT static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ @@ -87,6 +88,7 @@ static __always_inline __wsum csum_shift(__wsum sum, int offset) return (__force __wsum)ror32((__force u32)sum, 8); return sum; } +#endif static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) -- cgit v1.2.3 From d15cb3dab1e4f00e29599a4f5e1f6678a530d270 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Mar 2022 16:30:30 +0100 Subject: x86/livepatch: Validate __fentry__ location Currently livepatch assumes __fentry__ lives at func+0, which is most likely untrue with IBT on. Instead make it use ftrace_location() by default which both validates and finds the actual ip if there is any in the same symbol. Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220308154318.285971256@infradead.org --- arch/powerpc/include/asm/livepatch.h | 10 ---------- kernel/livepatch/patch.c | 19 ++----------------- 2 files changed, 2 insertions(+), 27 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 4fe018cc207b..7b9dcd51af32 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -19,16 +19,6 @@ static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) regs_set_return_ip(regs, ip); } -#define klp_get_ftrace_location klp_get_ftrace_location -static inline unsigned long klp_get_ftrace_location(unsigned long faddr) -{ - /* - * Live patch works only with -mprofile-kernel on PPC. In this case, - * the ftrace location is always within the first 16 bytes. - */ - return ftrace_location_range(faddr, faddr + 16); -} - static inline void klp_init_thread_info(struct task_struct *p) { /* + 1 to account for STACK_END_MAGIC */ diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index fe316c021d73..c172bf92b576 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -124,19 +124,6 @@ unlock: ftrace_test_recursion_unlock(bit); } -/* - * Convert a function address into the appropriate ftrace location. - * - * Usually this is just the address of the function, but on some architectures - * it's more complicated so allow them to provide a custom behaviour. - */ -#ifndef klp_get_ftrace_location -static unsigned long klp_get_ftrace_location(unsigned long faddr) -{ - return faddr; -} -#endif - static void klp_unpatch_func(struct klp_func *func) { struct klp_ops *ops; @@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func) if (list_is_singular(&ops->func_stack)) { unsigned long ftrace_loc; - ftrace_loc = - klp_get_ftrace_location((unsigned long)func->old_func); + ftrace_loc = ftrace_location((unsigned long)func->old_func); if (WARN_ON(!ftrace_loc)) return; @@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func) if (!ops) { unsigned long ftrace_loc; - ftrace_loc = - klp_get_ftrace_location((unsigned long)func->old_func); + ftrace_loc = ftrace_location((unsigned long)func->old_func); if (!ftrace_loc) { pr_err("failed to find location for function '%s'\n", func->old_name); -- cgit v1.2.3 From d1d8a3b4d06d8c9188f2b9b89ef053db0bf899de Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 Dec 2021 13:26:22 -0500 Subject: mm: Turn isolate_lru_page() into folio_isolate_lru() Add isolate_lru_page() as a wrapper around isolate_lru_folio(). TestClearPageLRU() would have always failed on a tail page, so returning -EBUSY is the same behaviour. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- arch/powerpc/include/asm/mmu_context.h | 1 - mm/folio-compat.c | 8 +++++++ mm/internal.h | 3 ++- mm/vmscan.c | 43 +++++++++++++++------------------- 4 files changed, 29 insertions(+), 26 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index fd277b15635c..b8527a74bd4d 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -21,7 +21,6 @@ extern void destroy_context(struct mm_struct *mm); #ifdef CONFIG_SPAPR_TCE_IOMMU struct mm_iommu_table_group_mem_t; -extern int isolate_lru_page(struct page *page); /* from internal.h */ extern bool mm_iommu_preregistered(struct mm_struct *mm); extern long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 749555a232a8..a4a7725f4486 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -7,6 +7,7 @@ #include #include #include +#include "internal.h" struct address_space *page_mapping(struct page *page) { @@ -151,3 +152,10 @@ int try_to_release_page(struct page *page, gfp_t gfp) return filemap_release_folio(page_folio(page), gfp); } EXPORT_SYMBOL(try_to_release_page); + +int isolate_lru_page(struct page *page) +{ + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) + return -EBUSY; + return folio_isolate_lru((struct folio *)page); +} diff --git a/mm/internal.h b/mm/internal.h index 98b97cb5a97b..69e88fb7546e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -152,7 +152,8 @@ extern unsigned long highest_memmap_pfn; /* * in mm/vmscan.c: */ -extern int isolate_lru_page(struct page *page); +int isolate_lru_page(struct page *page); +int folio_isolate_lru(struct folio *folio); extern void putback_lru_page(struct page *page); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); diff --git a/mm/vmscan.c b/mm/vmscan.c index 74d3e5e8ebe9..55fb6d8e30fd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2211,45 +2211,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } /** - * isolate_lru_page - tries to isolate a page from its LRU list - * @page: page to isolate from its LRU list + * folio_isolate_lru() - Try to isolate a folio from its LRU list. + * @folio: Folio to isolate from its LRU list. * - * Isolates a @page from an LRU list, clears PageLRU and adjusts the - * vmstat statistic corresponding to whatever LRU list the page was on. + * Isolate a @folio from an LRU list and adjust the vmstat statistic + * corresponding to whatever LRU list the folio was on. * - * Returns 0 if the page was removed from an LRU list. - * Returns -EBUSY if the page was not on an LRU list. - * - * The returned page will have PageLRU() cleared. If it was found on - * the active list, it will have PageActive set. If it was found on - * the unevictable list, it will have the PageUnevictable bit set. That flag + * The folio will have its LRU flag cleared. If it was found on the + * active list, it will have the Active flag set. If it was found on the + * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * - * The vmstat statistic corresponding to the list on which the page was - * found will be decremented. - * - * Restrictions: + * Context: * * (1) Must be called with an elevated refcount on the page. This is a - * fundamental difference from isolate_lru_pages (which is called + * fundamental difference from isolate_lru_pages() (which is called * without a stable reference). - * (2) the lru_lock must not be held. - * (3) interrupts must be enabled. + * (2) The lru_lock must not be held. + * (3) Interrupts must be enabled. + * + * Return: 0 if the folio was removed from an LRU list. + * -EBUSY if the folio was not on an LRU list. */ -int isolate_lru_page(struct page *page) +int folio_isolate_lru(struct folio *folio) { - struct folio *folio = page_folio(page); int ret = -EBUSY; - VM_BUG_ON_PAGE(!page_count(page), page); - WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); + VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); - if (TestClearPageLRU(page)) { + if (folio_test_clear_lru(folio)) { struct lruvec *lruvec; - get_page(page); + folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); - del_page_from_lru_list(page, lruvec); + lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); ret = 0; } -- cgit v1.2.3 From 9e996c2115e1b52e235261121f7c9c121262dda9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Feb 2022 10:30:07 -0500 Subject: powerpc: Add pmd_pfn() This is straightforward for everything except nohash64 where we indirect through pmd_page(). There must be a better way to do this. Signed-off-by: Matthew Wilcox (Oracle) --- arch/powerpc/include/asm/book3s/32/pgtable.h | 4 ++-- arch/powerpc/include/asm/nohash/32/pgtable.h | 7 +++---- arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index f8b94f78403f..95e06f2a8e23 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -372,8 +372,8 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HASHPTE) == 0) -#define pmd_page(pmd) \ - pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) +#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * Encode and decode a swap entry. diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index d959c2a73fbf..dcc9b338e042 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -349,15 +349,14 @@ static inline int pte_young(pte_t pte) * of the pte page. -- paulus */ #ifndef CONFIG_BOOKE -#define pmd_page(pmd) \ - pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) +#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #else #define pmd_page_vaddr(pmd) \ ((unsigned long)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1))) -#define pmd_page(pmd) \ - pfn_to_page((__pa(pmd_val(pmd)) >> PAGE_SHIFT)) +#define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT) #endif +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * Encode and decode a swap entry. * Note that the bits we use in a PTE for representing a swap entry diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 2816d158280a..78888b0c30f6 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -142,6 +142,7 @@ static inline pte_t pmd_pte(pmd_t pmd) #define pmd_present(pmd) (!pmd_none(pmd)) #define pmd_page_vaddr(pmd) (pmd_val(pmd) & ~PMD_MASKED_BITS) extern struct page *pmd_page(pmd_t pmd); +#define pmd_pfn(pmd) (page_to_pfn(pmd_page(pmd))) static inline void pud_set(pud_t *pudp, unsigned long val) { -- cgit v1.2.3 From 16785bd7743104d57257a455001172b75afa7614 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 22 Mar 2022 14:41:47 -0700 Subject: mm: merge pte_mkhuge() call into arch_make_huge_pte() Each call into pte_mkhuge() is invariably followed by arch_make_huge_pte(). Instead arch_make_huge_pte() can accommodate pte_mkhuge() at the beginning. This updates generic fallback stub for arch_make_huge_pte() and available platforms definitions. This makes huge pte creation much cleaner and easier to follow. Link: https://lkml.kernel.org/r/1643860669-26307-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Acked-by: Mike Kravetz Acked-by: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Paul Mackerras Cc: "David S. Miller" Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 1 + arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 4 ++-- arch/sparc/mm/hugetlbpage.c | 1 + include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 3 +-- mm/vmalloc.c | 1 - 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index ffb9c229610a..228226c5fa80 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -347,6 +347,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; + entry = pte_mkhuge(entry); if (pagesize == CONT_PTE_SIZE) { entry = pte_mkcont(entry); } else if (pagesize == CONT_PMD_SIZE) { diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 64b6c608eca4..de092b04ee1a 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -71,9 +71,9 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags size_t size = 1UL << shift; if (size == SZ_16K) - return __pte(pte_val(entry) & ~_PAGE_HUGE); + return __pte(pte_val(entry) | _PAGE_SPS); else - return entry; + return __pte(pte_val(entry) | _PAGE_SPS | _PAGE_HUGE); } #define arch_make_huge_pte arch_make_huge_pte #endif diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 0f49fada2093..d8e0e3c7038d 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -181,6 +181,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { pte_t pte; + entry = pte_mkhuge(entry); pte = hugepage_shift_to_tte(entry, shift); #ifdef CONFIG_SPARC64 diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1897a69c540..52c462390aee 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -754,7 +754,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { - return entry; + return pte_mkhuge(entry); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f294db835f4b..a404af0b49a0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4637,7 +4637,6 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; @@ -6171,7 +6170,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); - pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); + pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4165304d3547..d0b14dd73adc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -118,7 +118,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry); pfn += PFN_DOWN(size); -- cgit v1.2.3 From e16faf26780fc0c8dd693ea9ee8420a7706cb2f5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 22 Mar 2022 14:43:17 -0700 Subject: cma: factor out minimum alignment requirement Patch series "mm: enforce pageblock_order < MAX_ORDER". Having pageblock_order >= MAX_ORDER seems to be able to happen in corner cases and some parts of the kernel are not prepared for it. For example, Aneesh has shown [1] that such kernels can be compiled on ppc64 with 64k base pages by setting FORCE_MAX_ZONEORDER=8, which will run into a WARN_ON_ONCE(order >= MAX_ORDER) in comapction code right during boot. We can get pageblock_order >= MAX_ORDER when the default hugetlb size is bigger than the maximum allocation granularity of the buddy, in which case we are no longer talking about huge pages but instead gigantic pages. Having pageblock_order >= MAX_ORDER can only make alloc_contig_range() of such gigantic pages more likely to succeed. Reliable use of gigantic pages either requires boot time allcoation or CMA, no need to overcomplicate some places in the kernel to optimize for corner cases that are broken in other areas of the kernel. This patch (of 2): Let's enforce pageblock_order < MAX_ORDER and simplify. Especially patch #1 can be regarded a cleanup before: [PATCH v5 0/6] Use pageblock_order for cma and alloc_contig_range alignment. [2] [1] https://lkml.kernel.org/r/87r189a2ks.fsf@linux.ibm.com [2] https://lkml.kernel.org/r/20220211164135.1803616-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20220214174132.219303-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Rob Herring Cc: Aneesh Kumar K.V Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frank Rowand Cc: Michael S. Tsirkin Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Minchan Kim Cc: Vlastimil Babka Cc: John Garry via iommu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/fadump-internal.h | 5 ----- arch/powerpc/kernel/fadump.c | 2 +- drivers/of/of_reserved_mem.c | 9 +++------ include/linux/cma.h | 9 +++++++++ kernel/dma/contiguous.c | 4 +--- mm/cma.c | 20 +++++--------------- 6 files changed, 19 insertions(+), 30 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 52189928ec08..81bcb9abb371 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -19,11 +19,6 @@ #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) -/* Alignment per CMA requirement. */ -#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ - max_t(unsigned long, MAX_ORDER - 1, \ - pageblock_order)) - /* FAD commands */ #define FADUMP_REGISTER 1 #define FADUMP_UNREGISTER 2 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d03e488cfe9c..7eb67201ea41 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -544,7 +544,7 @@ int __init fadump_reserve_mem(void) if (!fw_dump.nocma) { fw_dump.boot_memory_size = ALIGN(fw_dump.boot_memory_size, - FADUMP_CMA_ALIGNMENT); + CMA_MIN_ALIGNMENT_BYTES); } #endif diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 9c0fb962c22b..75caa6f5d36f 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "of_private.h" @@ -116,12 +117,8 @@ static int __init __reserved_mem_alloc_size(unsigned long node, if (IS_ENABLED(CONFIG_CMA) && of_flat_dt_is_compatible(node, "shared-dma-pool") && of_get_flat_dt_prop(node, "reusable", NULL) - && !nomap) { - unsigned long order = - max_t(unsigned long, MAX_ORDER - 1, pageblock_order); - - align = max(align, (phys_addr_t)PAGE_SIZE << order); - } + && !nomap) + align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES); prop = of_get_flat_dt_prop(node, "alloc-ranges", &len); if (prop) { diff --git a/include/linux/cma.h b/include/linux/cma.h index bd801023504b..75fe188ec4a1 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -20,6 +20,15 @@ #define CMA_MAX_NAME 64 +/* + * TODO: once the buddy -- especially pageblock merging and alloc_contig_range() + * -- can deal with only some pageblocks of a higher-order page being + * MIGRATE_CMA, we can use pageblock_nr_pages. + */ +#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \ + pageblock_nr_pages) +#define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) + struct cma; extern unsigned long totalcma_pages; diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 3d63d91cba5c..6ea80ae42622 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = { static int __init rmem_cma_setup(struct reserved_mem *rmem) { - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; @@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - if ((rmem->base & mask) || (rmem->size & mask)) { + if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } diff --git a/mm/cma.c b/mm/cma.c index bc9ca8f3c487..5a2cd5851658 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -168,7 +168,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma) { struct cma *cma; - phys_addr_t alignment; /* Sanity checks */ if (cma_area_count == ARRAY_SIZE(cma_areas)) { @@ -179,15 +178,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* ensure minimal alignment required by mm core */ - alignment = PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order); - /* alignment should be aligned with order_per_bit */ - if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) + if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit)) return -EINVAL; - if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size) + /* ensure minimal alignment required by mm core */ + if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; /* @@ -262,14 +258,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (alignment && !is_power_of_2(alignment)) return -EINVAL; - /* - * Sanitise input arguments. - * Pages both ends in CMA area could be merged into adjacent unmovable - * migratetype page by page allocator's buddy algorithm. In the case, - * you couldn't get a contiguous memory, which is not what we want. - */ - alignment = max(alignment, (phys_addr_t)PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); + /* Sanitise input arguments. */ + alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { ret = -EINVAL; pr_err("Region at %pa must be aligned to %pa bytes\n", -- cgit v1.2.3 From f82da161ea75dc4db21b2499e4b1facd36dab275 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 28 Mar 2022 08:55:39 +1100 Subject: powerpc: restore removed #endif Fixes: 7001052160d1 ("Merge tag 'x86_core_for_5.18_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip") Signed-off-by: Stephen Rothwell Brown-paper-bag-by: Linus Torvalds Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/livepatch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index fd65931a739f..1c60094ea0cd 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -16,6 +16,7 @@ static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { ftrace_instruction_pointer_set(fregs, ip); } +#endif /* CONFIG_LIVEPATCH */ #ifdef CONFIG_LIVEPATCH_64 static inline void klp_init_thread_info(struct task_struct *p) -- cgit v1.2.3 From 7f921a2d6c93051b6002dbb7c1781f1fa5b88cce Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 3 Apr 2022 22:12:52 +1000 Subject: KVM: PPC: Move kvmhv_on_pseries() into kvm_ppc.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We recently introduced a usage of kvmhv_on_pseries() in powerpc.c, which causes a build error for ppc64_book3e_allmodconfig: arch/powerpc/kvm/powerpc.c:716:8: error: implicit declaration of function ‘kvmhv_on_pseries’ 716 | if (kvmhv_on_pseries()) { | ^~~~~~~~~~~~~~~~ Fix it by moving kvmhv_on_pseries() into kvm_ppc.h so that the stub version is available for book3e builds. Fixes: f771b55731fc ("KVM: PPC: Use KVM_CAP_PPC_AIL_MODE_3") Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s_64.h | 12 ------------ arch/powerpc/include/asm/kvm_ppc.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 827038a33064..4def2bd17b9b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -16,18 +16,6 @@ #include #include -#ifdef CONFIG_PPC_PSERIES -static inline bool kvmhv_on_pseries(void) -{ - return !cpu_has_feature(CPU_FTR_HVMODE); -} -#else -static inline bool kvmhv_on_pseries(void) -{ - return false; -} -#endif - /* * Structure for a nested guest, that is, for a guest that is managed by * one of our guests. diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c583d0c37f31..838d4cb460b7 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -586,6 +586,18 @@ static inline bool kvm_hv_mode_active(void) { return false; } #endif +#ifdef CONFIG_PPC_PSERIES +static inline bool kvmhv_on_pseries(void) +{ + return !cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool kvmhv_on_pseries(void) +{ + return false; +} +#endif + #ifdef CONFIG_KVM_XICS static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 5517d500829c683a358a8de04ecb2e28af629ae5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 14 Mar 2022 11:27:35 +0100 Subject: static_call: Properly initialise DEFINE_STATIC_CALL_RET0() When a static call is updated with __static_call_return0() as target, arch_static_call_transform() set it to use an optimised set of instructions which are meant to lay in the same cacheline. But when initialising a static call with DEFINE_STATIC_CALL_RET0(), we get a branch to the real __static_call_return0() function instead of getting the optimised setup: c00d8120 <__SCT__perf_snapshot_branch_stack>: c00d8120: 4b ff ff f4 b c00d8114 <__static_call_return0> c00d8124: 3d 80 c0 0e lis r12,-16370 c00d8128: 81 8c 81 3c lwz r12,-32452(r12) c00d812c: 7d 89 03 a6 mtctr r12 c00d8130: 4e 80 04 20 bctr c00d8134: 38 60 00 00 li r3,0 c00d8138: 4e 80 00 20 blr c00d813c: 00 00 00 00 .long 0x0 Add ARCH_DEFINE_STATIC_CALL_RET0_TRAMP() defined by each architecture to setup the optimised configuration, and rework DEFINE_STATIC_CALL_RET0() to call it: c00d8120 <__SCT__perf_snapshot_branch_stack>: c00d8120: 48 00 00 14 b c00d8134 <__SCT__perf_snapshot_branch_stack+0x14> c00d8124: 3d 80 c0 0e lis r12,-16370 c00d8128: 81 8c 81 3c lwz r12,-32452(r12) c00d812c: 7d 89 03 a6 mtctr r12 c00d8130: 4e 80 04 20 bctr c00d8134: 38 60 00 00 li r3,0 c00d8138: 4e 80 00 20 blr c00d813c: 00 00 00 00 .long 0x0 Signed-off-by: Christophe Leroy Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/1e0a61a88f52a460f62a58ffc2a5f847d1f7d9d8.1647253456.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/static_call.h | 1 + arch/x86/include/asm/static_call.h | 2 ++ include/linux/static_call.h | 20 +++++++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/static_call.h b/arch/powerpc/include/asm/static_call.h index 0a0bc79bd1fa..de1018cc522b 100644 --- a/arch/powerpc/include/asm/static_call.h +++ b/arch/powerpc/include/asm/static_call.h @@ -24,5 +24,6 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) __PPC_SCT(name, "b " #func) #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) __PPC_SCT(name, "blr") +#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) __PPC_SCT(name, "b .+20") #endif /* _ASM_POWERPC_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index ed4f8bb6c2d9..2455d721503e 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -38,6 +38,8 @@ #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop") +#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) \ + ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0) #define ARCH_ADD_TRAMP_KEY(name) \ asm(".pushsection .static_call_tramp_key, \"a\" \n" \ diff --git a/include/linux/static_call.h b/include/linux/static_call.h index fcc5b48989b3..3c50b0fdda16 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -196,6 +196,14 @@ extern long __static_call_return0(void); }; \ ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) +#define DEFINE_STATIC_CALL_RET0(name, _func) \ + DECLARE_STATIC_CALL(name, _func); \ + struct static_call_key STATIC_CALL_KEY(name) = { \ + .func = __static_call_return0, \ + .type = 1, \ + }; \ + ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) + #define static_call_cond(name) (void)__static_call(name) #define EXPORT_STATIC_CALL(name) \ @@ -231,6 +239,12 @@ static inline int static_call_init(void) { return 0; } }; \ ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) +#define DEFINE_STATIC_CALL_RET0(name, _func) \ + DECLARE_STATIC_CALL(name, _func); \ + struct static_call_key STATIC_CALL_KEY(name) = { \ + .func = __static_call_return0, \ + }; \ + ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) #define static_call_cond(name) (void)__static_call(name) @@ -284,6 +298,9 @@ static inline long __static_call_return0(void) .func = NULL, \ } +#define DEFINE_STATIC_CALL_RET0(name, _func) \ + __DEFINE_STATIC_CALL(name, _func, __static_call_return0) + static inline void __static_call_nop(void) { } /* @@ -327,7 +344,4 @@ static inline int static_call_text_reserved(void *start, void *end) #define DEFINE_STATIC_CALL(name, _func) \ __DEFINE_STATIC_CALL(name, _func, _func) -#define DEFINE_STATIC_CALL_RET0(name, _func) \ - __DEFINE_STATIC_CALL(name, _func, __static_call_return0) - #endif /* _LINUX_STATIC_CALL_H */ -- cgit v1.2.3 From ffa0b64e3be58519ae472ea29a1a1ad681e32f48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 7 Apr 2022 00:57:57 +1000 Subject: powerpc: Fix virt_addr_valid() for 64-bit Book3E & 32-bit mpe: On 64-bit Book3E vmalloc space starts at 0x8000000000000000. Because of the way __pa() works we have: __pa(0x8000000000000000) == 0, and therefore virt_to_pfn(0x8000000000000000) == 0, and therefore virt_addr_valid(0x8000000000000000) == true Which is wrong, virt_addr_valid() should be false for vmalloc space. In fact all vmalloc addresses that alias with a valid PFN will return true from virt_addr_valid(). That can cause bugs with hardened usercopy as described below by Kefeng Wang: When running ethtool eth0 on 64-bit Book3E, a BUG occurred: usercopy: Kernel memory exposure attempt detected from SLUB object not in SLUB page?! (offset 0, size 1048)! kernel BUG at mm/usercopy.c:99 ... usercopy_abort+0x64/0xa0 (unreliable) __check_heap_object+0x168/0x190 __check_object_size+0x1a0/0x200 dev_ethtool+0x2494/0x2b20 dev_ioctl+0x5d0/0x770 sock_do_ioctl+0xf0/0x1d0 sock_ioctl+0x3ec/0x5a0 __se_sys_ioctl+0xf0/0x160 system_call_exception+0xfc/0x1f0 system_call_common+0xf8/0x200 The code shows below, data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) The data is alloced by vmalloc(), virt_addr_valid(ptr) will return true on 64-bit Book3E, which leads to the panic. As commit 4dd7554a6456 ("powerpc/64: Add VIRTUAL_BUG_ON checks for __va and __pa addresses") does, make sure the virt addr above PAGE_OFFSET in the virt_addr_valid() for 64-bit, also add upper limit check to make sure the virt is below high_memory. Meanwhile, for 32-bit PAGE_OFFSET is the virtual address of the start of lowmem, high_memory is the upper low virtual address, the check is suitable for 32-bit, this will fix the issue mentioned in commit 602946ec2f90 ("powerpc: Set max_mapnr correctly") too. On 32-bit there is a similar problem with high memory, that was fixed in commit 602946ec2f90 ("powerpc: Set max_mapnr correctly"), but that commit breaks highmem and needs to be reverted. We can't easily fix __pa(), we have code that relies on its current behaviour. So for now add extra checks to virt_addr_valid(). For 64-bit Book3S the extra checks are not necessary, the combination of virt_to_pfn() and pfn_valid() should yield the correct result, but they are harmless. Signed-off-by: Kefeng Wang Reviewed-by: Christophe Leroy [mpe: Add additional change log detail] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220406145802.538416-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/page.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 254687258f42..f2c5c26869f1 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -132,7 +132,11 @@ static inline bool pfn_valid(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) +#define virt_addr_valid(vaddr) ({ \ + unsigned long _addr = (unsigned long)vaddr; \ + _addr >= PAGE_OFFSET && _addr < (unsigned long)high_memory && \ + pfn_valid(virt_to_pfn(_addr)); \ +}) /* * On Book-E parts we need __va to parse the device tree and we can't -- cgit v1.2.3 From 8ba2ed1be90fc210126f68186564707478552c95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 13:36:57 +0200 Subject: swiotlb: add a SWIOTLB_ANY flag to lift the low memory restriction Power SVM wants to allocate a swiotlb buffer that is not restricted to low memory for the trusted hypervisor scheme. Consolidate the support for this into the swiotlb_init interface by adding a new flag. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/powerpc/include/asm/svm.h | 4 ---- arch/powerpc/include/asm/swiotlb.h | 1 + arch/powerpc/kernel/dma-swiotlb.c | 1 + arch/powerpc/mm/mem.c | 5 +---- arch/powerpc/platforms/pseries/svm.c | 26 +------------------------- include/linux/swiotlb.h | 1 + kernel/dma/swiotlb.c | 11 +++++++++-- 7 files changed, 14 insertions(+), 35 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h index 7546402d796a..85580b30aba4 100644 --- a/arch/powerpc/include/asm/svm.h +++ b/arch/powerpc/include/asm/svm.h @@ -15,8 +15,6 @@ static inline bool is_secure_guest(void) return mfmsr() & MSR_S; } -void __init svm_swiotlb_init(void); - void dtl_cache_ctor(void *addr); #define get_dtl_cache_ctor() (is_secure_guest() ? dtl_cache_ctor : NULL) @@ -27,8 +25,6 @@ static inline bool is_secure_guest(void) return false; } -static inline void svm_swiotlb_init(void) {} - #define get_dtl_cache_ctor() NULL #endif /* CONFIG_PPC_SVM */ diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 3c1a1cd16128..4203b5e0a88e 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -9,6 +9,7 @@ #include extern unsigned int ppc_swiotlb_enable; +extern unsigned int ppc_swiotlb_flags; #ifdef CONFIG_SWIOTLB void swiotlb_detect_4g(void); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index fc7816126a40..ba256c37bcc0 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -10,6 +10,7 @@ #include unsigned int ppc_swiotlb_enable; +unsigned int ppc_swiotlb_flags; void __init swiotlb_detect_4g(void) { diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 74ca516c3e7e..46fb78e3bb36 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -249,10 +249,7 @@ void __init mem_init(void) * back to to-down. */ memblock_set_bottom_up(true); - if (is_secure_guest()) - svm_swiotlb_init(); - else - swiotlb_init(ppc_swiotlb_enable, 0); + swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags); #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index c5228f4969eb..3b4045d508ec 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -28,7 +28,7 @@ static int __init init_svm(void) * need to use the SWIOTLB buffer for DMA even if dma_capable() says * otherwise. */ - swiotlb_force = SWIOTLB_FORCE; + ppc_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_FORCE; /* Share the SWIOTLB buffer with the host. */ swiotlb_update_mem_attributes(); @@ -37,30 +37,6 @@ static int __init init_svm(void) } machine_early_initcall(pseries, init_svm); -/* - * Initialize SWIOTLB. Essentially the same as swiotlb_init(), except that it - * can allocate the buffer anywhere in memory. Since the hypervisor doesn't have - * any addressing limitation, we don't need to allocate it in low addresses. - */ -void __init svm_swiotlb_init(void) -{ - unsigned char *vstart; - unsigned long bytes, io_tlb_nslabs; - - io_tlb_nslabs = (swiotlb_size_or_default() >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; - - vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false)) - return; - - - memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - panic("SVM: Cannot allocate SWIOTLB buffer"); -} - int set_memory_encrypted(unsigned long addr, int numpages) { if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ae0407173e84..eabdd8998702 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -15,6 +15,7 @@ struct scatterlist; #define SWIOTLB_VERBOSE (1 << 0) /* verbose initialization */ #define SWIOTLB_FORCE (1 << 1) /* force bounce buffering */ +#define SWIOTLB_ANY (1 << 2) /* allow any memory for the buffer */ /* * Maximum allowable number of contiguous slabs to map, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 86e877a96b82..f6e091424af3 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -266,8 +266,15 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags) if (swiotlb_force_disable) return; - /* Get IO TLB memory from the low pages */ - tlb = memblock_alloc_low(bytes, PAGE_SIZE); + /* + * By default allocate the bounce buffer memory from low memory, but + * allow to pick a location everywhere for hypervisors with guest + * memory encryption. + */ + if (flags & SWIOTLB_ANY) + tlb = memblock_alloc(bytes, PAGE_SIZE); + else + tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) goto fail; if (swiotlb_init_with_tbl(tlb, default_nslabs, flags)) -- cgit v1.2.3 From 755a9d44e6e2a217cceaab9fe2c8bac55ee7f8b2 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 15 Apr 2022 14:08:15 -0500 Subject: powerpc: Remove unused SLOW_DOWN_IO definition Remove unused SLOW_DOWN_IO definition. Signed-off-by: Bjorn Helgaas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220415190817.842864-6-helgaas@kernel.org --- arch/powerpc/include/asm/io.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index fee979d3a1aa..c5a5f7c9b231 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -38,8 +38,6 @@ extern struct pci_dev *isa_bridge_pcidev; #define SIO_CONFIG_RA 0x398 #define SIO_CONFIG_RD 0x399 -#define SLOW_DOWN_IO - /* 32 bits uses slightly different variables for the various IO * bases. Most of this file only uses _IO_BASE though which we * define properly based on the platform -- cgit v1.2.3 From 6584cec0a2255ab407d047d1b135fa0aae88d6c6 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 4 Apr 2022 23:51:37 +0530 Subject: powerpc/fadump: save CPU reg data in vmcore when PHYP terminates LPAR An LPAR can be terminated by the POWER Hypervisor (PHYP) for various reasons. If FADump was configured when PHYP terminates the LPAR, platform-assisted dump is initiated to save the kernel dump. But CPU register data would not be processed/saved in the vmcore in such case because CPU mask is set in crash_fadump() at the time of kernel crash and it remains unset in this case with LPAR being terminated by PHYP abruptly. To get around the problem, initialize cpu_mask to cpu_possible_mask so as to ensure all possible CPUs' register data is processed for the vmcore generated on PHYP terminated LPAR. Also, rename the crash info member variable from online_mask to cpu_mask as it doesn't necessarily have to be online CPU mask always. Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220404182137.59231-1-hbathini@linux.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 2 +- arch/powerpc/kernel/fadump.c | 7 ++++++- arch/powerpc/platforms/pseries/rtas-fadump.c | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 81bcb9abb371..27f9e11eda28 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -50,7 +50,7 @@ struct fadump_crash_info_header { u64 elfcorehdr_addr; u32 crashing_cpu; struct pt_regs regs; - struct cpumask online_mask; + struct cpumask cpu_mask; }; struct fadump_memory_range { diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 65562c4a0a69..8343c0b14277 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -728,7 +728,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) else ppc_save_regs(&fdh->regs); - fdh->online_mask = *cpu_online_mask; + fdh->cpu_mask = *cpu_online_mask; /* * If we came in via system reset, wait a while for the secondary @@ -1164,6 +1164,11 @@ static unsigned long init_fadump_header(unsigned long addr) fdh->elfcorehdr_addr = addr; /* We will set the crashing cpu id in crash_fadump() during crash. */ fdh->crashing_cpu = FADUMP_CPU_UNKNOWN; + /* + * When LPAR is terminated by PYHP, ensure all possible CPUs' + * register data is processed while exporting the vmcore. + */ + fdh->cpu_mask = *cpu_possible_mask; return addr; } diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 35f9cb602c30..617c0f3b1f4f 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -351,7 +351,7 @@ static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) /* Lower 4 bytes of reg_value contains logical cpu id */ cpu = (be64_to_cpu(reg_entry->reg_value) & RTAS_FADUMP_CPU_ID_MASK); - if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { + if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_mask)) { RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); continue; } -- cgit v1.2.3 From 306f7cc1e9061313c46d19e9bdffc819880794c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Apr 2022 15:12:56 +0800 Subject: uapi: always define F_GETLK64/F_SETLK64/F_SETLKW64 in fcntl.h The F_GETLK64/F_SETLK64/F_SETLKW64 fcntl opcodes are only implemented for the 32-bit syscall APIs, but are also needed for compat handling on 64-bit kernels. Consolidate them in unistd.h instead of definining the internal compat definitions in compat.h, which is rather error prone (e.g. parisc gets the values wrong currently). Note that before this change they were never visible to userspace due to the fact that CONFIG_64BIT is only set for kernel builds. Signed-off-by: Christoph Hellwig Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220405071314.3225832-3-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/compat.h | 4 ---- arch/mips/include/asm/compat.h | 4 ---- arch/mips/include/uapi/asm/fcntl.h | 4 ++-- arch/powerpc/include/asm/compat.h | 4 ---- arch/s390/include/asm/compat.h | 4 ---- arch/sparc/include/asm/compat.h | 4 ---- arch/x86/include/asm/compat.h | 4 ---- include/uapi/asm-generic/fcntl.h | 4 ++-- tools/include/uapi/asm-generic/fcntl.h | 2 -- 9 files changed, 4 insertions(+), 30 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index eaa6ca062d89..276328765408 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -73,10 +73,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index bbb3bc5a42fd..6a350c1f70d7 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -65,10 +65,6 @@ struct compat_flock { s32 pad[4]; }; -#define F_GETLK64 33 -#define F_SETLK64 34 -#define F_SETLKW64 35 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/mips/include/uapi/asm/fcntl.h b/arch/mips/include/uapi/asm/fcntl.h index 9e44ac810db9..0369a38e3d4f 100644 --- a/arch/mips/include/uapi/asm/fcntl.h +++ b/arch/mips/include/uapi/asm/fcntl.h @@ -44,11 +44,11 @@ #define F_SETOWN 24 /* for sockets. */ #define F_GETOWN 23 /* for sockets. */ -#ifndef __mips64 +#if __BITS_PER_LONG == 32 || defined(__KERNEL__) #define F_GETLK64 33 /* using 'struct flock64' */ #define F_SETLK64 34 #define F_SETLKW64 35 -#endif +#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ #if _MIPS_SIM != _MIPS_SIM_ABI64 #define __ARCH_FLOCK_EXTRA_SYSID long l_sysid; diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 7afc96fb6524..83d8f70779cb 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -52,10 +52,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index cdc7ae72529d..0f14b3188b1b 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -110,10 +110,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index bd949fcf9d63..108078751bb5 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -84,10 +84,6 @@ struct compat_flock { short __unused; }; -#define F_GETLK64 12 -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 7516e4199b3c..8d19a212f4f2 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -58,10 +58,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 - /* * IA32 uses 4 byte alignment for 64 bit quantities, * so we need to pack this structure. diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 77aa9f2ff98d..f13d37b60775 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -116,13 +116,13 @@ #define F_GETSIG 11 /* for sockets. */ #endif -#ifndef CONFIG_64BIT +#if __BITS_PER_LONG == 32 || defined(__KERNEL__) #ifndef F_GETLK64 #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 #endif -#endif +#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h index 99bc9b15ce2b..0197042b7dfb 100644 --- a/tools/include/uapi/asm-generic/fcntl.h +++ b/tools/include/uapi/asm-generic/fcntl.h @@ -115,13 +115,11 @@ #define F_GETSIG 11 /* for sockets. */ #endif -#ifndef CONFIG_64BIT #ifndef F_GETLK64 #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 #endif -#endif #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 -- cgit v1.2.3 From 3ce0f2373f7073f04715b1ffd7b1812d3183f79a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Apr 2022 15:12:57 +0800 Subject: compat: consolidate the compat_flock{,64} definition Provide a single common definition for the compat_flock and compat_flock64 structures using the same tricks as for the native variants. Another extra define is added for the packing required on x86. Signed-off-by: Christoph Hellwig Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-4-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/compat.h | 16 ---------------- arch/mips/include/asm/compat.h | 19 ++----------------- arch/parisc/include/asm/compat.h | 16 ---------------- arch/powerpc/include/asm/compat.h | 16 ---------------- arch/s390/include/asm/compat.h | 16 ---------------- arch/sparc/include/asm/compat.h | 18 +----------------- arch/x86/include/asm/compat.h | 20 +++----------------- include/linux/compat.h | 31 +++++++++++++++++++++++++++++++ 8 files changed, 37 insertions(+), 115 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 276328765408..e0faec1984a1 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -65,22 +65,6 @@ struct compat_stat { compat_ulong_t __unused4[2]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { int f_type; int f_bsize; diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 6a350c1f70d7..6d6e5a451f4d 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -55,23 +55,8 @@ struct compat_stat { s32 st_pad4[14]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - s32 l_sysid; - compat_pid_t l_pid; - s32 pad[4]; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; +#define __ARCH_COMPAT_FLOCK_EXTRA_SYSID s32 l_sysid; +#define __ARCH_COMPAT_FLOCK_PAD s32 pad[4]; struct compat_statfs { int f_type; diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index c04f5a637c39..a1e4534d8050 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -53,22 +53,6 @@ struct compat_stat { u32 st_spare4[3]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { s32 f_type; s32 f_bsize; diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 83d8f70779cb..5ef3c7c83c34 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -44,22 +44,6 @@ struct compat_stat { u32 __unused4[2]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { int f_type; int f_bsize; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 0f14b3188b1b..07f04d37068b 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -102,22 +102,6 @@ struct compat_stat { u32 __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { u32 f_type; u32 f_bsize; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 108078751bb5..d78fb44942e0 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -75,23 +75,7 @@ struct compat_stat64 { unsigned int __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; - short __unused; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; - short __unused; -}; +#define __ARCH_COMPAT_FLOCK_PAD short __unused; struct compat_statfs { int f_type; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 8d19a212f4f2..de794d895866 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -50,25 +50,11 @@ struct compat_stat { u32 __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - /* - * IA32 uses 4 byte alignment for 64 bit quantities, - * so we need to pack this structure. + * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the + * compat flock64 structure. */ -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -} __attribute__((packed)); +#define __ARCH_NEED_COMPAT_FLOCK64_PACKED struct compat_statfs { int f_type; diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c758b0e0359..a0481fe6c5d5 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -258,6 +258,37 @@ struct compat_rlimit { compat_ulong_t rlim_max; }; +#ifdef __ARCH_NEED_COMPAT_FLOCK64_PACKED +#define __ARCH_COMPAT_FLOCK64_PACK __attribute__((packed)) +#else +#define __ARCH_COMPAT_FLOCK64_PACK +#endif + +struct compat_flock { + short l_type; + short l_whence; + compat_off_t l_start; + compat_off_t l_len; +#ifdef __ARCH_COMPAT_FLOCK_EXTRA_SYSID + __ARCH_COMPAT_FLOCK_EXTRA_SYSID +#endif + compat_pid_t l_pid; +#ifdef __ARCH_COMPAT_FLOCK_PAD + __ARCH_COMPAT_FLOCK_PAD +#endif +}; + +struct compat_flock64 { + short l_type; + short l_whence; + compat_loff_t l_start; + compat_loff_t l_len; + compat_pid_t l_pid; +#ifdef __ARCH_COMPAT_FLOCK64_PAD + __ARCH_COMPAT_FLOCK64_PAD +#endif +} __ARCH_COMPAT_FLOCK64_PACK; + struct compat_rusage { struct old_timeval32 ru_utime; struct old_timeval32 ru_stime; -- cgit v1.2.3 From f18ed30db299458f809aec55bf1800dbeebeb953 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:12:59 +0800 Subject: fs: stat: compat: Add __ARCH_WANT_COMPAT_STAT RISC-V doesn't neeed compat_stat, so using __ARCH_WANT_COMPAT_STAT to exclude unnecessary SYSCALL functions. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-6-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/unistd.h | 1 + arch/mips/include/asm/unistd.h | 2 ++ arch/parisc/include/asm/unistd.h | 1 + arch/powerpc/include/asm/unistd.h | 1 + arch/s390/include/asm/unistd.h | 1 + arch/sparc/include/asm/unistd.h | 1 + arch/x86/include/asm/unistd.h | 1 + fs/stat.c | 2 +- 8 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 4e65da3445c7..037feba03a51 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -3,6 +3,7 @@ * Copyright (C) 2012 ARM Ltd. */ #ifdef CONFIG_COMPAT +#define __ARCH_WANT_COMPAT_STAT #define __ARCH_WANT_COMPAT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index c2196b1b6604..25a5253db7f4 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -50,6 +50,8 @@ # ifdef CONFIG_32BIT # define __ARCH_WANT_STAT64 # define __ARCH_WANT_SYS_TIME32 +# else +# define __ARCH_WANT_COMPAT_STAT # endif # ifdef CONFIG_MIPS32_O32 # define __ARCH_WANT_SYS_TIME32 diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 7708a5806f09..81cbad73b517 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -164,6 +164,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE3 #define __ARCH_WANT_COMPAT_SYS_SENDFILE +#define __ARCH_WANT_COMPAT_STAT #ifdef CONFIG_64BIT #define __ARCH_WANT_SYS_TIME diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 5eb462af6766..b1129b4ef57d 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -44,6 +44,7 @@ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_NEWFSTATAT +#define __ARCH_WANT_COMPAT_STAT #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif #define __ARCH_WANT_SYS_FORK diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 9e9f75ef046a..4260bc5ce7f8 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -28,6 +28,7 @@ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK # ifdef CONFIG_COMPAT +# define __ARCH_WANT_COMPAT_STAT # define __ARCH_WANT_SYS_TIME32 # define __ARCH_WANT_SYS_UTIME32 # endif diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index 1e66278ba4a5..d6bc76706a7a 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -46,6 +46,7 @@ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_COMPAT_SYS_SENDFILE +#define __ARCH_WANT_COMPAT_STAT #endif #ifdef __32bit_syscall_numbers__ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 80e9d5206a71..761173ccc33c 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -22,6 +22,7 @@ # include # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME +# define __ARCH_WANT_COMPAT_STAT # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 diff --git a/fs/stat.c b/fs/stat.c index 7f734be0e57e..5d8f723c1e0b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -660,7 +660,7 @@ SYSCALL_DEFINE5(statx, return ret; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT) static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; -- cgit v1.2.3 From 84a0c977ab9821a4b57f69d9a6108f64188d1e71 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:13:00 +0800 Subject: asm-generic: compat: Cleanup duplicate definitions There are 7 64bit architectures that support Linux COMPAT mode to run 32bit applications. A lot of definitions are duplicate: - COMPAT_USER_HZ - COMPAT_RLIM_INFINITY - COMPAT_OFF_T_MAX - __compat_uid_t, __compat_uid_t - compat_dev_t - compat_ipc_pid_t - struct compat_flock - struct compat_flock64 - struct compat_statfs - struct compat_ipc64_perm, compat_semid64_ds, compat_msqid64_ds, compat_shmid64_ds Cleanup duplicate definitions and merge them into asm-generic. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-7-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/compat.h | 73 ++++---------------------- arch/mips/include/asm/compat.h | 18 +++---- arch/parisc/include/asm/compat.h | 29 ++--------- arch/powerpc/include/asm/compat.h | 30 ++--------- arch/s390/include/asm/compat.h | 79 +++++----------------------- arch/sparc/include/asm/compat.h | 39 +++++--------- arch/x86/include/asm/compat.h | 80 +++++----------------------- include/asm-generic/compat.h | 106 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 170 insertions(+), 284 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index e0faec1984a1..9f362274a4f7 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -8,6 +8,15 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + #include #ifdef CONFIG_COMPAT @@ -19,21 +28,15 @@ typedef u16 compat_mode_t; #include #include -#define COMPAT_USER_HZ 100 #ifdef __AARCH64EB__ #define COMPAT_UTS_MACHINE "armv8b\0\0" #else #define COMPAT_UTS_MACHINE "armv8l\0\0" #endif -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; typedef u16 __compat_uid16_t; typedef u16 __compat_gid16_t; -typedef u32 compat_dev_t; typedef s32 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { #ifdef __AARCH64EB__ @@ -87,64 +90,6 @@ struct compat_statfs { #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current))) #define COMPAT_MINSIGSTKSZ 2048 -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - unsigned short mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - compat_ulong_t unused1; - compat_ulong_t unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused3; - compat_ulong_t __unused4; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - static inline int is_compat_task(void) { return test_thread_flag(TIF_32BIT); diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 6d6e5a451f4d..ec01dc000a41 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -9,28 +9,28 @@ #include #include +#define __compat_uid_t __compat_uid_t typedef s32 __compat_uid_t; typedef s32 __compat_gid_t; + typedef __compat_uid_t __compat_uid32_t; typedef __compat_gid_t __compat_gid32_t; #define __compat_uid32_t __compat_uid32_t -#define __compat_gid32_t __compat_gid32_t + +#define compat_statfs compat_statfs +#define compat_ipc64_perm compat_ipc64_perm #define _COMPAT_NSIG 128 /* Don't ask !$@#% ... */ #define _COMPAT_NSIG_BPW 32 typedef u32 compat_sigset_word; +#define COMPAT_RLIM_INFINITY 0x7fffffffUL + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "mips\0\0\0" -typedef u32 compat_dev_t; typedef u32 compat_nlink_t; -typedef s32 compat_ipc_pid_t; -typedef struct { - s32 val[2]; -} compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -73,10 +73,6 @@ struct compat_statfs { int f_spare[5]; }; -#define COMPAT_RLIM_INFINITY 0x7fffffffUL - -#define COMPAT_OFF_T_MAX 0x7fffffff - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index a1e4534d8050..339d1b833fa7 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -11,16 +11,16 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_ipc64_perm compat_ipc64_perm + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "parisc\0\0" -typedef u32 __compat_uid_t; -typedef u32 __compat_gid_t; -typedef u32 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; struct compat_stat { compat_dev_t st_dev; /* dev_t is 32 bits on parisc */ @@ -53,21 +53,6 @@ struct compat_stat { u32 st_spare4[3]; }; -struct compat_statfs { - s32 f_type; - s32 f_bsize; - s32 f_blocks; - s32 f_bfree; - s32 f_bavail; - s32 f_files; - s32 f_ffree; - __kernel_fsid_t f_fsid; - s32 f_namelen; - s32 f_frsize; - s32 f_flags; - s32 f_spare[4]; -}; - struct compat_sigcontext { compat_int_t sc_flags; compat_int_t sc_gr[32]; /* PSW in sc_gr[0] */ @@ -77,10 +62,6 @@ struct compat_sigcontext { compat_int_t sc_sar; /* cr11 */ }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - struct compat_ipc64_perm { compat_key_t key; __compat_uid_t uid; diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 5ef3c7c83c34..dda4091fd012 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -8,21 +8,20 @@ #include #include +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_ipc64_perm compat_ipc64_perm + #include -#define COMPAT_USER_HZ 100 #ifdef __BIG_ENDIAN__ #define COMPAT_UTS_MACHINE "ppc\0\0" #else #define COMPAT_UTS_MACHINE "ppcle\0\0" #endif -typedef u32 __compat_uid_t; -typedef u32 __compat_gid_t; -typedef u32 compat_dev_t; typedef s16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -44,25 +43,6 @@ struct compat_stat { u32 __unused4[2]; }; -struct compat_statfs { - int f_type; - int f_bsize; - int f_blocks; - int f_bfree; - int f_bavail; - int f_files; - int f_ffree; - compat_fsid_t f_fsid; - int f_namelen; /* SunOS ignores this field. */ - int f_frsize; - int f_flags; - int f_spare[4]; -}; - -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - /* * ipc64_perm is actually 32/64bit clean but since the compat layer refers to * it we may as well define it. diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 07f04d37068b..ad809cf3dd97 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -12,6 +12,18 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + #include #define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ @@ -53,15 +65,9 @@ typedef u16 compat_mode_t; PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ PSW32_ASC_PRIMARY) -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "s390\0\0\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u16 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; typedef struct { u32 mask; @@ -132,10 +138,6 @@ struct compat_statfs64 { u32 f_spare[4]; }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - /* * A pointer passed in from user mode. This should not * be used for syscall parameters, just declare them @@ -158,61 +160,4 @@ static inline int is_compat_task(void) #endif -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - compat_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned int __unused1; - unsigned int __unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; #endif /* _ASM_S390X_COMPAT_H */ diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index d78fb44942e0..e4382d2efa56 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -9,17 +9,25 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_ipc64_perm compat_ipc64_perm + +#define COMPAT_RLIM_INFINITY 0x7fffffff + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "sparc\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u16 compat_dev_t; typedef s16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -77,25 +85,6 @@ struct compat_stat64 { #define __ARCH_COMPAT_FLOCK_PAD short __unused; -struct compat_statfs { - int f_type; - int f_bsize; - int f_blocks; - int f_bfree; - int f_bavail; - int f_files; - int f_ffree; - compat_fsid_t f_fsid; - int f_namelen; /* SunOS ignores this field. */ - int f_frsize; - int f_flags; - int f_spare[4]; -}; - -#define COMPAT_RLIM_INFINITY 0x7fffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index de794d895866..e74a107de0d0 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -15,17 +15,23 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u16 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -71,68 +77,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - unsigned short mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - compat_ulong_t unused1; - compat_ulong_t unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused3; - compat_ulong_t __unused4; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h index d46c0201cc34..11653d6846cc 100644 --- a/include/asm-generic/compat.h +++ b/include/asm-generic/compat.h @@ -2,6 +2,18 @@ #ifndef __ASM_GENERIC_COMPAT_H #define __ASM_GENERIC_COMPAT_H +#ifndef COMPAT_USER_HZ +#define COMPAT_USER_HZ 100 +#endif + +#ifndef COMPAT_RLIM_INFINITY +#define COMPAT_RLIM_INFINITY 0xffffffff +#endif + +#ifndef COMPAT_OFF_T_MAX +#define COMPAT_OFF_T_MAX 0x7fffffff +#endif + /* These types are common across all compat ABIs */ typedef u32 compat_size_t; typedef s32 compat_ssize_t; @@ -24,6 +36,11 @@ typedef u32 compat_caddr_t; typedef u32 compat_aio_context_t; typedef u32 compat_old_sigset_t; +#ifndef __compat_uid_t +typedef u32 __compat_uid_t; +typedef u32 __compat_gid_t; +#endif + #ifndef __compat_uid32_t typedef u32 __compat_uid32_t; typedef u32 __compat_gid32_t; @@ -47,4 +64,93 @@ typedef u32 compat_sigset_word; #define _COMPAT_NSIG_BPW 32 #endif +#ifndef compat_dev_t +typedef u32 compat_dev_t; +#endif + +#ifndef compat_ipc_pid_t +typedef s32 compat_ipc_pid_t; +#endif + +#ifndef compat_fsid_t +typedef __kernel_fsid_t compat_fsid_t; +#endif + +#ifndef compat_statfs +struct compat_statfs { + compat_int_t f_type; + compat_int_t f_bsize; + compat_int_t f_blocks; + compat_int_t f_bfree; + compat_int_t f_bavail; + compat_int_t f_files; + compat_int_t f_ffree; + compat_fsid_t f_fsid; + compat_int_t f_namelen; + compat_int_t f_frsize; + compat_int_t f_flags; + compat_int_t f_spare[4]; +}; +#endif + +#ifndef compat_ipc64_perm +struct compat_ipc64_perm { + compat_key_t key; + __compat_uid32_t uid; + __compat_gid32_t gid; + __compat_uid32_t cuid; + __compat_gid32_t cgid; + compat_mode_t mode; + unsigned char __pad1[4 - sizeof(compat_mode_t)]; + compat_ushort_t seq; + compat_ushort_t __pad2; + compat_ulong_t unused1; + compat_ulong_t unused2; +}; + +struct compat_semid64_ds { + struct compat_ipc64_perm sem_perm; + compat_ulong_t sem_otime; + compat_ulong_t sem_otime_high; + compat_ulong_t sem_ctime; + compat_ulong_t sem_ctime_high; + compat_ulong_t sem_nsems; + compat_ulong_t __unused3; + compat_ulong_t __unused4; +}; + +struct compat_msqid64_ds { + struct compat_ipc64_perm msg_perm; + compat_ulong_t msg_stime; + compat_ulong_t msg_stime_high; + compat_ulong_t msg_rtime; + compat_ulong_t msg_rtime_high; + compat_ulong_t msg_ctime; + compat_ulong_t msg_ctime_high; + compat_ulong_t msg_cbytes; + compat_ulong_t msg_qnum; + compat_ulong_t msg_qbytes; + compat_pid_t msg_lspid; + compat_pid_t msg_lrpid; + compat_ulong_t __unused4; + compat_ulong_t __unused5; +}; + +struct compat_shmid64_ds { + struct compat_ipc64_perm shm_perm; + compat_size_t shm_segsz; + compat_ulong_t shm_atime; + compat_ulong_t shm_atime_high; + compat_ulong_t shm_dtime; + compat_ulong_t shm_dtime_high; + compat_ulong_t shm_ctime; + compat_ulong_t shm_ctime_high; + compat_pid_t shm_cpid; + compat_pid_t shm_lpid; + compat_ulong_t shm_nattch; + compat_ulong_t __unused4; + compat_ulong_t __unused5; +}; +#endif + #endif -- cgit v1.2.3 From 634093c59a12fc90e33c4e5acd1da4498fd159d4 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: powerpc/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This defines and exports a platform specific custom vm_get_page_prot() via subscribing ARCH_HAS_VM_GET_PAGE_PROT. While here, this also localizes arch_vm_get_page_prot() as __vm_get_page_prot() and moves it near vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-3-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Cc: Michael Ellerman Cc: Paul Mackerras Cc: Catalin Marinas Cc: Christoph Hellwig Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/mman.h | 12 ------------ arch/powerpc/mm/book3s64/pgtable.c | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 174edabb74fa..69e44358a235 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -140,6 +140,7 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index 7cb6d18f5cd6..1b024e64c8ec 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -24,18 +24,6 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) -static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) -{ -#ifdef CONFIG_PPC_MEM_KEYS - return (vm_flags & VM_SAO) ? - __pgprot(_PAGE_SAO | vmflag_to_pte_pkey_bits(vm_flags)) : - __pgprot(0 | vmflag_to_pte_pkey_bits(vm_flags)); -#else - return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0); -#endif -} -#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags) - static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM | PROT_SAO)) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 052e6590f84f..8b474ab32f67 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -549,3 +550,19 @@ unsigned long memremap_compat_align(void) } EXPORT_SYMBOL_GPL(memremap_compat_align); #endif + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long prot = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + + if (vm_flags & VM_SAO) + prot |= _PAGE_SAO; + +#ifdef CONFIG_PPC_MEM_KEYS + prot |= vmflag_to_pte_pkey_bits(vm_flags); +#endif + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); -- cgit v1.2.3 From ce0091a0e06045e70c526c8735f8b866a85e0a45 Mon Sep 17 00:00:00 2001 From: He Ying Date: Wed, 24 Mar 2021 05:09:39 -0400 Subject: powerpc/time: Fix sparse warnings We found these warnings in arch/powerpc/kernel/time.c as follows: warning: symbol 'decrementer_max' was not declared. Should it be static? warning: symbol 'rtc_lock' was not declared. Should it be static? warning: symbol 'dtl_consumer' was not declared. Should it be static? Declare 'decrementer_max' in powerpc asm/time.h. Include linux/mc146818rtc.h in powerpc kernel/time.c where 'rtc_lock' is declared. And remove duplicated declaration of 'rtc_lock' in powerpc platforms/chrp/time.c because it has included linux/mc146818rtc.h. Move 'dtl_consumer' definition after "include " because it is declared there. Reported-by: Hulk Robot Signed-off-by: He Ying Reviewed-by: Christophe Leroy Reviewed-by: Alexandre Belloni Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210324090939.143477-1-heying24@huawei.com --- arch/powerpc/include/asm/time.h | 1 + arch/powerpc/kernel/time.c | 9 ++++----- arch/powerpc/platforms/chrp/time.c | 2 -- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 924b2157882f..1e5643a9b1f2 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -24,6 +24,7 @@ extern unsigned long tb_ticks_per_jiffy; extern unsigned long tb_ticks_per_usec; extern unsigned long tb_ticks_per_sec; extern struct clock_event_device decrementer_clockevent; +extern u64 decrementer_max; extern void generic_calibrate_decr(void); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index f5cbfe5efd25..caeb481a8dea 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -54,8 +54,9 @@ #include #include #include -#include +#include +#include #include #include #include @@ -156,10 +157,6 @@ bool tb_invalid; u64 __cputime_usec_factor; EXPORT_SYMBOL(__cputime_usec_factor); -#ifdef CONFIG_PPC_SPLPAR -void (*dtl_consumer)(struct dtl_entry *, u64); -#endif - static void calc_cputime_factors(void) { struct div_result res; @@ -185,6 +182,8 @@ static inline unsigned long read_spurr(unsigned long tb) #include +void (*dtl_consumer)(struct dtl_entry *, u64); + /* * Scan the dispatch trace log and count up the stolen time. * Should be called with interrupts disabled. diff --git a/arch/powerpc/platforms/chrp/time.c b/arch/powerpc/platforms/chrp/time.c index acde7bbe0716..b94dfd5090d8 100644 --- a/arch/powerpc/platforms/chrp/time.c +++ b/arch/powerpc/platforms/chrp/time.c @@ -30,8 +30,6 @@ #include -extern spinlock_t rtc_lock; - #define NVRAM_AS0 0x74 #define NVRAM_AS1 0x75 #define NVRAM_DATA 0x77 -- cgit v1.2.3 From 0a3ef48c2fac4691db6060df54723e46e0c69b2a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 24 Mar 2021 22:07:14 +0800 Subject: powerpc/eeh: Remove unused inline function eeh_dev_phb_init_dynamic() commit 475028efc708 ("powerpc/eeh: Remove eeh_dev_phb_init_dynamic()") left behind this, so can remove it. Signed-off-by: YueHaibing Reviewed-by: Daniel Axtens Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210324140714.19612-1-yuehaibing@huawei.com --- arch/powerpc/include/asm/eeh.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index bd513fd49be9..7b2b3798dea0 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -333,8 +333,6 @@ static inline bool eeh_enabled(void) static inline void eeh_show_enabled(void) { } -static inline void eeh_dev_phb_init_dynamic(struct pci_controller *phb) { } - static inline int eeh_check_failure(const volatile void __iomem *token) { return 0; -- cgit v1.2.3 From 44c10404c13604d1c98e285ff1c4bf821da80240 Mon Sep 17 00:00:00 2001 From: Magali Lemes Date: Mon, 21 Feb 2022 20:07:41 -0300 Subject: powerpc: Fix missing declaration of [en/dis]able_kernel_altivec() When CONFIG_PPC64 is set and CONFIG_ALTIVEC is not the following build failures occur: drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/dc_fpu.c: In function 'dc_fpu_begin': >> drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/dc_fpu.c:61:17: error: implicit declaration of function 'enable_kernel_altivec'; did you mean 'enable_kernel_vsx'? [-Werror=implicit-function-declaration] 61 | enable_kernel_altivec(); | ^~~~~~~~~~~~~~~~~~~~~ | enable_kernel_vsx drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/dc_fpu.c: In function 'dc_fpu_end': >> drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/dc_fpu.c:89:17: error: implicit declaration of function 'disable_kernel_altivec'; did you mean 'disable_kernel_vsx'? [-Werror=implicit-function-declaration] 89 | disable_kernel_altivec(); | ^~~~~~~~~~~~~~~~~~~~~~ | disable_kernel_vsx cc1: some warnings being treated as errors This commit adds stub instances of both enable_kernel_altivec() and disable_kernel_altivec() the same way as done in commit bd73758803c2 regarding enable_kernel_vsx() and disable_kernel_vsx(). Reported-by: kernel test robot Signed-off-by: Magali Lemes Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220221230741.293064-1-magalilemes00@gmail.com --- arch/powerpc/include/asm/switch_to.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 1f43ef696033..aee25e3ebf96 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -62,6 +62,15 @@ static inline void disable_kernel_altivec(void) #else static inline void save_altivec(struct task_struct *t) { } static inline void __giveup_altivec(struct task_struct *t) { } +static inline void enable_kernel_altivec(void) +{ + BUILD_BUG(); +} + +static inline void disable_kernel_altivec(void) +{ + BUILD_BUG(); +} #endif #ifdef CONFIG_VSX -- cgit v1.2.3 From dc7a98b89b0c209e6ccba0a1f6390adbc36d5ca4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 24 Mar 2021 22:07:52 +0800 Subject: powerpc/smp: Remove unused inline functions commit 441c19c8a290 ("powerpc/kvm/book3s_hv: Rework the secondary inhibit code") left behind this, so can remove it. Signed-off-by: YueHaibing Reviewed-by: Daniel Axtens Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210324140752.11320-1-yuehaibing@huawei.com --- arch/powerpc/include/asm/smp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 60ab739a5e3b..f63505d74932 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -189,8 +189,6 @@ extern void __cpu_die(unsigned int cpu); #define smp_setup_cpu_maps() #define thread_group_shares_l2 0 #define thread_group_shares_l3 0 -static inline void inhibit_secondary_onlining(void) {} -static inline void uninhibit_secondary_onlining(void) {} static inline const struct cpumask *cpu_sibling_mask(int cpu) { return cpumask_of(cpu); -- cgit v1.2.3 From 5e6ec1ad2e89c9b9d4047778748e42f4450fb381 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 11 Mar 2022 21:00:17 +0800 Subject: powerpc/kuap: Remove unused inline function __kuap_assert_locked() commit 2341964e27b0 ("powerpc/kuap: Remove __kuap_assert_locked()") left behind this one, remove it. Signed-off-by: YueHaibing Acked-by: Christophe Leroy Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220311130017.22936-1-yuehaibing@huawei.com --- arch/powerpc/include/asm/kup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index fb2237809d63..d751ddd08110 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -52,7 +52,6 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) return false; } -static inline void __kuap_assert_locked(void) { } static inline void __kuap_lock(void) { } static inline void __kuap_save_and_lock(struct pt_regs *regs) { } static inline void kuap_user_restore(struct pt_regs *regs) { } -- cgit v1.2.3 From f06351f8c0c85e2d53e73c53a33b4ef55b4ad6de Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 16 Mar 2022 18:42:39 +0800 Subject: powerpc/eeh: Remove unused inline functions pseries_eeh_init_edev() is used exclusively in eeh_pseries.c, make it static and remove unused inline function. pseries_eeh_init_edev_recursive() is only called from files build wich CONFIG_HOTPLUG_PCI_RPA which depends on CONFIG_PSERIES and CONFIG_EEH, so can remove the unused inline version. Suggested-by: Christophe Leroy Signed-off-by: YueHaibing Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220316104239.26508-1-yuehaibing@huawei.com --- arch/powerpc/include/asm/eeh.h | 4 ---- arch/powerpc/platforms/pseries/eeh_pseries.c | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 7b2b3798dea0..514dd056c2c8 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -352,11 +352,7 @@ static inline int eeh_phb_pe_create(struct pci_controller *phb) { return 0; } #endif /* CONFIG_EEH */ #if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_EEH) -void pseries_eeh_init_edev(struct pci_dn *pdn); void pseries_eeh_init_edev_recursive(struct pci_dn *pdn); -#else -static inline void pseries_eeh_add_device_early(struct pci_dn *pdn) { } -static inline void pseries_eeh_add_device_tree_early(struct pci_dn *pdn) { } #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 09fafcf2d3a0..f9af879c0222 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -43,6 +43,8 @@ static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_pe; +static void pseries_eeh_init_edev(struct pci_dn *pdn); + static void pseries_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); @@ -359,7 +361,7 @@ static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev) * This function takes care of the initialisation and inserts the eeh_dev * into the correct eeh_pe. If no eeh_pe exists we'll allocate one. */ -void pseries_eeh_init_edev(struct pci_dn *pdn) +static void pseries_eeh_init_edev(struct pci_dn *pdn) { struct eeh_pe pe, *parent; struct eeh_dev *edev; -- cgit v1.2.3 From 1408fca0c198471a5cd089742b9d3f9739073483 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:30 +0200 Subject: powerpc/mm: Make slice specific to book3s/64 Since commit 555904d07eef ("powerpc/8xx: MM_SLICE is not needed anymore") only book3s/64 selects CONFIG_PPC_MM_SLICES. Move slice.c into mm/book3s64/ Move necessary stuff in asm/book3s/64/slice.h and remove asm/slice.h Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4a0d74ef1966a5902b5fd4ac4b513a760a6d675a.1649523076.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1 + arch/powerpc/include/asm/book3s/64/slice.h | 18 + arch/powerpc/include/asm/page.h | 1 - arch/powerpc/include/asm/slice.h | 46 -- arch/powerpc/mm/Makefile | 1 - arch/powerpc/mm/book3s64/Makefile | 1 + arch/powerpc/mm/book3s64/slice.c | 769 +++++++++++++++++++++++++ arch/powerpc/mm/nohash/mmu_context.c | 9 - arch/powerpc/mm/nohash/tlb.c | 4 - arch/powerpc/mm/slice.c | 771 -------------------------- 10 files changed, 789 insertions(+), 832 deletions(-) delete mode 100644 arch/powerpc/include/asm/slice.h create mode 100644 arch/powerpc/mm/book3s64/slice.c delete mode 100644 arch/powerpc/mm/slice.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 21f780942911..1c4eebbc69c9 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -18,6 +18,7 @@ * complete pgtable.h but only a portion of it. */ #include +#include #include #include diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index f0d3194ba41b..5b0f7105bc8b 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_BOOK3S_64_SLICE_H #define _ASM_POWERPC_BOOK3S_64_SLICE_H +#ifndef __ASSEMBLY__ + #define SLICE_LOW_SHIFT 28 #define SLICE_LOW_TOP (0x100000000ul) #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) @@ -13,4 +15,20 @@ #define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW_USER64 +struct mm_struct; + +unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, + unsigned long flags, unsigned int psize, + int topdown); + +unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr); + +void slice_set_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long len, unsigned int psize); + +void slice_init_new_context_exec(struct mm_struct *mm); +void slice_setup_new_exec(void); + +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index f2c5c26869f1..b3101ffe0151 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -333,6 +333,5 @@ static inline unsigned long kaslr_offset(void) #include #endif /* __ASSEMBLY__ */ -#include #endif /* _ASM_POWERPC_PAGE_H */ diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h deleted file mode 100644 index 0bdd9c62eca0..000000000000 --- a/arch/powerpc/include/asm/slice.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_SLICE_H -#define _ASM_POWERPC_SLICE_H - -#ifdef CONFIG_PPC_BOOK3S_64 -#include -#endif - -#ifndef __ASSEMBLY__ - -struct mm_struct; - -#ifdef CONFIG_PPC_MM_SLICES - -#ifdef CONFIG_HUGETLB_PAGE -#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -#endif -#define HAVE_ARCH_UNMAPPED_AREA -#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN - -unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, - unsigned long flags, unsigned int psize, - int topdown); - -unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr); - -void slice_set_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long len, unsigned int psize); - -void slice_init_new_context_exec(struct mm_struct *mm); -void slice_setup_new_exec(void); - -#else /* CONFIG_PPC_MM_SLICES */ - -static inline void slice_init_new_context_exec(struct mm_struct *mm) {} - -static inline unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -#endif /* CONFIG_PPC_MM_SLICES */ - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_POWERPC_SLICE_H */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index df8172da2301..d4c20484dad9 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -14,7 +14,6 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 2d50cac499c5..af2f3e75d458 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o endif obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o obj-$(CONFIG_PPC_PKEY) += pkeys.o +obj-$(CONFIG_PPC_MM_SLICES) += slice.o # Instrumenting the SLB fault path can lead to duplicate SLB entries KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c new file mode 100644 index 000000000000..e4382713746d --- /dev/null +++ b/arch/powerpc/mm/book3s64/slice.c @@ -0,0 +1,769 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * address space "slices" (meta-segments) support + * + * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation. + * + * Based on hugetlb implementation + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(slice_convert_lock); + +#ifdef DEBUG +int _slice_debug = 1; + +static void slice_print_mask(const char *label, const struct slice_mask *mask) +{ + if (!_slice_debug) + return; + pr_devel("%s low_slice: %*pbl\n", label, + (int)SLICE_NUM_LOW, &mask->low_slices); + pr_devel("%s high_slice: %*pbl\n", label, + (int)SLICE_NUM_HIGH, mask->high_slices); +} + +#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0) + +#else + +static void slice_print_mask(const char *label, const struct slice_mask *mask) {} +#define slice_dbg(fmt...) + +#endif + +static inline notrace bool slice_addr_is_low(unsigned long addr) +{ + u64 tmp = (u64)addr; + + return tmp < SLICE_LOW_TOP; +} + +static void slice_range_to_mask(unsigned long start, unsigned long len, + struct slice_mask *ret) +{ + unsigned long end = start + len - 1; + + ret->low_slices = 0; + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); + + ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); + } + + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; + + bitmap_set(ret->high_slices, start_index, count); + } +} + +static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + + if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) + return 0; + vma = find_vma(mm, addr); + return (!vma || (addr + len) <= vm_start_gap(vma)); +} + +static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) +{ + return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT, + 1ul << SLICE_LOW_SHIFT); +} + +static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) +{ + unsigned long start = slice << SLICE_HIGH_SHIFT; + unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); + + /* Hack, so that each addresses is controlled by exactly one + * of the high or low area bitmaps, the first high area starts + * at 4GB, not 0 */ + if (start == 0) + start = (unsigned long)SLICE_LOW_TOP; + + return !slice_area_is_free(mm, start, end - start); +} + +static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, + unsigned long high_limit) +{ + unsigned long i; + + ret->low_slices = 0; + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + + for (i = 0; i < SLICE_NUM_LOW; i++) + if (!slice_low_has_vma(mm, i)) + ret->low_slices |= 1u << i; + + if (slice_addr_is_low(high_limit - 1)) + return; + + for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) + if (!slice_high_has_vma(mm, i)) + __set_bit(i, ret->high_slices); +} + +static bool slice_check_range_fits(struct mm_struct *mm, + const struct slice_mask *available, + unsigned long start, unsigned long len) +{ + unsigned long end = start + len - 1; + u64 low_slices = 0; + + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); + + low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); + } + if ((low_slices & available->low_slices) != low_slices) + return false; + + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; + unsigned long i; + + for (i = start_index; i < start_index + count; i++) { + if (!test_bit(i, available->high_slices)) + return false; + } + } + + return true; +} + +static void slice_flush_segments(void *parm) +{ +#ifdef CONFIG_PPC64 + struct mm_struct *mm = parm; + unsigned long flags; + + if (mm != current->active_mm) + return; + + copy_mm_to_paca(current->active_mm); + + local_irq_save(flags); + slb_flush_and_restore_bolted(); + local_irq_restore(flags); +#endif +} + +static void slice_convert(struct mm_struct *mm, + const struct slice_mask *mask, int psize) +{ + int index, mask_index; + /* Write the new slice psize bits */ + unsigned char *hpsizes, *lpsizes; + struct slice_mask *psize_mask, *old_mask; + unsigned long i, flags; + int old_psize; + + slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); + slice_print_mask(" mask", mask); + + psize_mask = slice_mask_for_size(&mm->context, psize); + + /* We need to use a spinlock here to protect against + * concurrent 64k -> 4k demotion ... + */ + spin_lock_irqsave(&slice_convert_lock, flags); + + lpsizes = mm_ctx_low_slices(&mm->context); + for (i = 0; i < SLICE_NUM_LOW; i++) { + if (!(mask->low_slices & (1u << i))) + continue; + + mask_index = i & 0x1; + index = i >> 1; + + /* Update the slice_mask */ + old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(&mm->context, old_psize); + old_mask->low_slices &= ~(1u << i); + psize_mask->low_slices |= 1u << i; + + /* Update the sizes array */ + lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + hpsizes = mm_ctx_high_slices(&mm->context); + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { + if (!test_bit(i, mask->high_slices)) + continue; + + mask_index = i & 0x1; + index = i >> 1; + + /* Update the slice_mask */ + old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(&mm->context, old_psize); + __clear_bit(i, old_mask->high_slices); + __set_bit(i, psize_mask->high_slices); + + /* Update the sizes array */ + hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + slice_dbg(" lsps=%lx, hsps=%lx\n", + (unsigned long)mm_ctx_low_slices(&mm->context), + (unsigned long)mm_ctx_high_slices(&mm->context)); + + spin_unlock_irqrestore(&slice_convert_lock, flags); + + copro_flush_all_slbs(mm); +} + +/* + * Compute which slice addr is part of; + * set *boundary_addr to the start or end boundary of that slice + * (depending on 'end' parameter); + * return boolean indicating if the slice is marked as available in the + * 'available' slice_mark. + */ +static bool slice_scan_available(unsigned long addr, + const struct slice_mask *available, + int end, unsigned long *boundary_addr) +{ + unsigned long slice; + if (slice_addr_is_low(addr)) { + slice = GET_LOW_SLICE_INDEX(addr); + *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; + return !!(available->low_slices & (1u << slice)); + } else { + slice = GET_HIGH_SLICE_INDEX(addr); + *boundary_addr = (slice + end) ? + ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; + return !!test_bit(slice, available->high_slices); + } +} + +static unsigned long slice_find_area_bottomup(struct mm_struct *mm, + unsigned long len, + const struct slice_mask *available, + int psize, unsigned long high_limit) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long addr, found, next_end; + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + + addr = TASK_UNMAPPED_BASE; + /* + * Check till the allow max value for this mmap request + */ + while (addr < high_limit) { + info.low_limit = addr; + if (!slice_scan_available(addr, available, 1, &addr)) + continue; + + next_slice: + /* + * At this point [info.low_limit; addr) covers + * available slices only and ends at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the next available slice. + */ + if (addr >= high_limit) + addr = high_limit; + else if (slice_scan_available(addr, available, 1, &next_end)) { + addr = next_end; + goto next_slice; + } + info.high_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + return -ENOMEM; +} + +static unsigned long slice_find_area_topdown(struct mm_struct *mm, + unsigned long len, + const struct slice_mask *available, + int psize, unsigned long high_limit) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long addr, found, prev; + struct vm_unmapped_area_info info; + unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + + addr = mm->mmap_base; + /* + * If we are trying to allocate above DEFAULT_MAP_WINDOW + * Add the different to the mmap_base. + * Only for that request for which high_limit is above + * DEFAULT_MAP_WINDOW we should apply this. + */ + if (high_limit > DEFAULT_MAP_WINDOW) + addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; + + while (addr > min_addr) { + info.high_limit = addr; + if (!slice_scan_available(addr - 1, available, 0, &addr)) + continue; + + prev_slice: + /* + * At this point [addr; info.high_limit) covers + * available slices only and starts at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the previous available slice. + */ + if (addr < min_addr) + addr = min_addr; + else if (slice_scan_available(addr - 1, available, 0, &prev)) { + addr = prev; + goto prev_slice; + } + info.low_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + return slice_find_area_bottomup(mm, len, available, psize, high_limit); +} + + +static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, + const struct slice_mask *mask, int psize, + int topdown, unsigned long high_limit) +{ + if (topdown) + return slice_find_area_topdown(mm, len, mask, psize, high_limit); + else + return slice_find_area_bottomup(mm, len, mask, psize, high_limit); +} + +static inline void slice_copy_mask(struct slice_mask *dst, + const struct slice_mask *src) +{ + dst->low_slices = src->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH); +} + +static inline void slice_or_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) +{ + dst->low_slices = src1->low_slices | src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); +} + +static inline void slice_andnot_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) +{ + dst->low_slices = src1->low_slices & ~src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); +} + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_PAGE_BASE MMU_PAGE_64K +#else +#define MMU_PAGE_BASE MMU_PAGE_4K +#endif + +unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, + unsigned long flags, unsigned int psize, + int topdown) +{ + struct slice_mask good_mask; + struct slice_mask potential_mask; + const struct slice_mask *maskp; + const struct slice_mask *compat_maskp = NULL; + int fixed = (flags & MAP_FIXED); + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long page_size = 1UL << pshift; + struct mm_struct *mm = current->mm; + unsigned long newaddr; + unsigned long high_limit; + + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; + + if (len > high_limit) + return -ENOMEM; + if (len & (page_size - 1)) + return -EINVAL; + if (fixed) { + if (addr & (page_size - 1)) + return -EINVAL; + if (addr > high_limit - len) + return -ENOMEM; + } + + if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { + /* + * Increasing the slb_addr_limit does not require + * slice mask cache to be recalculated because it should + * be already initialised beyond the old address limit. + */ + mm_ctx_set_slb_addr_limit(&mm->context, high_limit); + + on_each_cpu(slice_flush_segments, mm, 1); + } + + /* Sanity checks */ + BUG_ON(mm->task_size == 0); + BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); + VM_BUG_ON(radix_enabled()); + + slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); + slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", + addr, len, flags, topdown); + + /* If hint, make sure it matches our alignment restrictions */ + if (!fixed && addr) { + addr = ALIGN(addr, page_size); + slice_dbg(" aligned addr=%lx\n", addr); + /* Ignore hint if it's too large or overlaps a VMA */ + if (addr > high_limit - len || addr < mmap_min_addr || + !slice_area_is_free(mm, addr, len)) + addr = 0; + } + + /* First make up a "good" mask of slices that have the right size + * already + */ + maskp = slice_mask_for_size(&mm->context, psize); + + /* + * Here "good" means slices that are already the right page size, + * "compat" means slices that have a compatible page size (i.e. + * 4k in a 64k pagesize kernel), and "free" means slices without + * any VMAs. + * + * If MAP_FIXED: + * check if fits in good | compat => OK + * check if fits in good | compat | free => convert free + * else bad + * If have hint: + * check if hint fits in good => OK + * check if hint fits in good | free => convert free + * Otherwise: + * search in good, found => OK + * search in good | free, found => convert free + * search in good | compat | free, found => convert free. + */ + + /* + * If we support combo pages, we can allow 64k pages in 4k slices + * The mask copies could be avoided in most cases here if we had + * a pointer to good mask for the next code to use. + */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); + if (fixed) + slice_or_mask(&good_mask, maskp, compat_maskp); + else + slice_copy_mask(&good_mask, maskp); + } else { + slice_copy_mask(&good_mask, maskp); + } + + slice_print_mask(" good_mask", &good_mask); + if (compat_maskp) + slice_print_mask(" compat_mask", compat_maskp); + + /* First check hint if it's valid or if we have MAP_FIXED */ + if (addr != 0 || fixed) { + /* Check if we fit in the good mask. If we do, we just return, + * nothing else to do + */ + if (slice_check_range_fits(mm, &good_mask, addr, len)) { + slice_dbg(" fits good !\n"); + newaddr = addr; + goto return_addr; + } + } else { + /* Now let's see if we can find something in the existing + * slices for that size + */ + newaddr = slice_find_area(mm, len, &good_mask, + psize, topdown, high_limit); + if (newaddr != -ENOMEM) { + /* Found within the good mask, we don't have to setup, + * we thus return directly + */ + slice_dbg(" found area at 0x%lx\n", newaddr); + goto return_addr; + } + } + /* + * We don't fit in the good mask, check what other slices are + * empty and thus can be converted + */ + slice_mask_for_free(mm, &potential_mask, high_limit); + slice_or_mask(&potential_mask, &potential_mask, &good_mask); + slice_print_mask(" potential", &potential_mask); + + if (addr != 0 || fixed) { + if (slice_check_range_fits(mm, &potential_mask, addr, len)) { + slice_dbg(" fits potential !\n"); + newaddr = addr; + goto convert; + } + } + + /* If we have MAP_FIXED and failed the above steps, then error out */ + if (fixed) + return -EBUSY; + + slice_dbg(" search...\n"); + + /* If we had a hint that didn't work out, see if we can fit + * anywhere in the good area. + */ + if (addr) { + newaddr = slice_find_area(mm, len, &good_mask, + psize, topdown, high_limit); + if (newaddr != -ENOMEM) { + slice_dbg(" found area at 0x%lx\n", newaddr); + goto return_addr; + } + } + + /* Now let's see if we can find something in the existing slices + * for that size plus free slices + */ + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); + + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && + psize == MMU_PAGE_64K) { + /* retry the search with 4k-page slices included */ + slice_or_mask(&potential_mask, &potential_mask, compat_maskp); + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); + } + + if (newaddr == -ENOMEM) + return -ENOMEM; + + slice_range_to_mask(newaddr, len, &potential_mask); + slice_dbg(" found potential area at 0x%lx\n", newaddr); + slice_print_mask(" mask", &potential_mask); + + convert: + /* + * Try to allocate the context before we do slice convert + * so that we handle the context allocation failure gracefully. + */ + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + + slice_andnot_mask(&potential_mask, &potential_mask, &good_mask); + if (compat_maskp && !fixed) + slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp); + if (potential_mask.low_slices || + (SLICE_NUM_HIGH && + !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) { + slice_convert(mm, &potential_mask, psize); + if (psize > MMU_PAGE_BASE) + on_each_cpu(slice_flush_segments, mm, 1); + } + return newaddr; + +return_addr: + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + return newaddr; +} +EXPORT_SYMBOL_GPL(slice_get_unmapped_area); + +unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + unsigned char *psizes; + int index, mask_index; + + VM_BUG_ON(radix_enabled()); + + if (slice_addr_is_low(addr)) { + psizes = mm_ctx_low_slices(&mm->context); + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = mm_ctx_high_slices(&mm->context); + index = GET_HIGH_SLICE_INDEX(addr); + } + mask_index = index & 0x1; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xf; +} +EXPORT_SYMBOL_GPL(get_slice_psize); + +void slice_init_new_context_exec(struct mm_struct *mm) +{ + unsigned char *hpsizes, *lpsizes; + struct slice_mask *mask; + unsigned int psize = mmu_virtual_psize; + + slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm); + + /* + * In the case of exec, use the default limit. In the + * case of fork it is just inherited from the mm being + * duplicated. + */ + mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); + mm_ctx_set_user_psize(&mm->context, psize); + + /* + * Set all slice psizes to the default. + */ + lpsizes = mm_ctx_low_slices(&mm->context); + memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); + + hpsizes = mm_ctx_high_slices(&mm->context); + memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); + + /* + * Slice mask cache starts zeroed, fill the default size cache. + */ + mask = slice_mask_for_size(&mm->context, psize); + mask->low_slices = ~0UL; + if (SLICE_NUM_HIGH) + bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); +} + +void slice_setup_new_exec(void) +{ + struct mm_struct *mm = current->mm; + + slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); + + if (!is_32bit_task()) + return; + + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); +} + +void slice_set_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long len, unsigned int psize) +{ + struct slice_mask mask; + + VM_BUG_ON(radix_enabled()); + + slice_range_to_mask(start, len, &mask); + slice_convert(mm, &mask, psize); +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * is_hugepage_only_range() is used by generic code to verify whether + * a normal mmap mapping (non hugetlbfs) is valid on a given area. + * + * until the generic code provides a more generic hook and/or starts + * calling arch get_unmapped_area for MAP_FIXED (which our implementation + * here knows how to deal with), we hijack it to keep standard mappings + * away from us. + * + * because of that generic code limitation, MAP_FIXED mapping cannot + * "convert" back a slice with no VMAs to the standard page size, only + * get_unmapped_area() can. It would be possible to fix it here but I + * prefer working on fixing the generic code instead. + * + * WARNING: This will not work if hugetlbfs isn't enabled since the + * generic code will redefine that function as 0 in that. This is ok + * for now as we only use slices with hugetlbfs enabled. This should + * be fixed as the generic code gets fixed. + */ +int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + const struct slice_mask *maskp; + unsigned int psize = mm_ctx_user_psize(&mm->context); + + VM_BUG_ON(radix_enabled()); + + maskp = slice_mask_for_size(&mm->context, psize); + + /* We need to account for 4k slices too */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { + const struct slice_mask *compat_maskp; + struct slice_mask available; + + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); + slice_or_mask(&available, maskp, compat_maskp); + return !slice_check_range_fits(mm, &available, addr, len); + } + + return !slice_check_range_fits(mm, maskp, addr, len); +} + +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + /* With radix we don't use slice, so derive it from vma*/ + if (radix_enabled()) + return vma_kernel_pagesize(vma); + + return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); +} +#endif diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index 85b048f04c56..ccd5819b1bd9 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -317,15 +317,6 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { - /* - * We have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - */ - if (mm->context.id == 0) - slice_init_new_context_exec(mm); mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; pte_frag_set(&mm->context, NULL); diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index fd2c77af5c55..7e1e7c3dc66a 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -773,9 +773,5 @@ void __init early_init_mmu(void) #ifdef CONFIG_PPC_47x early_init_mmu_47x(); #endif - -#ifdef CONFIG_PPC_MM_SLICES - mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); -#endif } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c deleted file mode 100644 index 8a3ac062b71e..000000000000 --- a/arch/powerpc/mm/slice.c +++ /dev/null @@ -1,771 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * address space "slices" (meta-segments) support - * - * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation. - * - * Based on hugetlb implementation - * - * Copyright (C) 2003 David Gibson, IBM Corporation. - */ - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_SPINLOCK(slice_convert_lock); - -#ifdef DEBUG -int _slice_debug = 1; - -static void slice_print_mask(const char *label, const struct slice_mask *mask) -{ - if (!_slice_debug) - return; - pr_devel("%s low_slice: %*pbl\n", label, - (int)SLICE_NUM_LOW, &mask->low_slices); - pr_devel("%s high_slice: %*pbl\n", label, - (int)SLICE_NUM_HIGH, mask->high_slices); -} - -#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0) - -#else - -static void slice_print_mask(const char *label, const struct slice_mask *mask) {} -#define slice_dbg(fmt...) - -#endif - -static inline notrace bool slice_addr_is_low(unsigned long addr) -{ - u64 tmp = (u64)addr; - - return tmp < SLICE_LOW_TOP; -} - -static void slice_range_to_mask(unsigned long start, unsigned long len, - struct slice_mask *ret) -{ - unsigned long end = start + len - 1; - - ret->low_slices = 0; - if (SLICE_NUM_HIGH) - bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); - - if (slice_addr_is_low(start)) { - unsigned long mend = min(end, - (unsigned long)(SLICE_LOW_TOP - 1)); - - ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) - - (1u << GET_LOW_SLICE_INDEX(start)); - } - - if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { - unsigned long start_index = GET_HIGH_SLICE_INDEX(start); - unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); - unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; - - bitmap_set(ret->high_slices, start_index, count); - } -} - -static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - struct vm_area_struct *vma; - - if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) - return 0; - vma = find_vma(mm, addr); - return (!vma || (addr + len) <= vm_start_gap(vma)); -} - -static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) -{ - return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT, - 1ul << SLICE_LOW_SHIFT); -} - -static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) -{ - unsigned long start = slice << SLICE_HIGH_SHIFT; - unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); - - /* Hack, so that each addresses is controlled by exactly one - * of the high or low area bitmaps, the first high area starts - * at 4GB, not 0 */ - if (start == 0) - start = (unsigned long)SLICE_LOW_TOP; - - return !slice_area_is_free(mm, start, end - start); -} - -static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, - unsigned long high_limit) -{ - unsigned long i; - - ret->low_slices = 0; - if (SLICE_NUM_HIGH) - bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); - - for (i = 0; i < SLICE_NUM_LOW; i++) - if (!slice_low_has_vma(mm, i)) - ret->low_slices |= 1u << i; - - if (slice_addr_is_low(high_limit - 1)) - return; - - for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) - if (!slice_high_has_vma(mm, i)) - __set_bit(i, ret->high_slices); -} - -static bool slice_check_range_fits(struct mm_struct *mm, - const struct slice_mask *available, - unsigned long start, unsigned long len) -{ - unsigned long end = start + len - 1; - u64 low_slices = 0; - - if (slice_addr_is_low(start)) { - unsigned long mend = min(end, - (unsigned long)(SLICE_LOW_TOP - 1)); - - low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) - - (1u << GET_LOW_SLICE_INDEX(start)); - } - if ((low_slices & available->low_slices) != low_slices) - return false; - - if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { - unsigned long start_index = GET_HIGH_SLICE_INDEX(start); - unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); - unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; - unsigned long i; - - for (i = start_index; i < start_index + count; i++) { - if (!test_bit(i, available->high_slices)) - return false; - } - } - - return true; -} - -static void slice_flush_segments(void *parm) -{ -#ifdef CONFIG_PPC64 - struct mm_struct *mm = parm; - unsigned long flags; - - if (mm != current->active_mm) - return; - - copy_mm_to_paca(current->active_mm); - - local_irq_save(flags); - slb_flush_and_restore_bolted(); - local_irq_restore(flags); -#endif -} - -static void slice_convert(struct mm_struct *mm, - const struct slice_mask *mask, int psize) -{ - int index, mask_index; - /* Write the new slice psize bits */ - unsigned char *hpsizes, *lpsizes; - struct slice_mask *psize_mask, *old_mask; - unsigned long i, flags; - int old_psize; - - slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); - slice_print_mask(" mask", mask); - - psize_mask = slice_mask_for_size(&mm->context, psize); - - /* We need to use a spinlock here to protect against - * concurrent 64k -> 4k demotion ... - */ - spin_lock_irqsave(&slice_convert_lock, flags); - - lpsizes = mm_ctx_low_slices(&mm->context); - for (i = 0; i < SLICE_NUM_LOW; i++) { - if (!(mask->low_slices & (1u << i))) - continue; - - mask_index = i & 0x1; - index = i >> 1; - - /* Update the slice_mask */ - old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(&mm->context, old_psize); - old_mask->low_slices &= ~(1u << i); - psize_mask->low_slices |= 1u << i; - - /* Update the sizes array */ - lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) | - (((unsigned long)psize) << (mask_index * 4)); - } - - hpsizes = mm_ctx_high_slices(&mm->context); - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { - if (!test_bit(i, mask->high_slices)) - continue; - - mask_index = i & 0x1; - index = i >> 1; - - /* Update the slice_mask */ - old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(&mm->context, old_psize); - __clear_bit(i, old_mask->high_slices); - __set_bit(i, psize_mask->high_slices); - - /* Update the sizes array */ - hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) | - (((unsigned long)psize) << (mask_index * 4)); - } - - slice_dbg(" lsps=%lx, hsps=%lx\n", - (unsigned long)mm_ctx_low_slices(&mm->context), - (unsigned long)mm_ctx_high_slices(&mm->context)); - - spin_unlock_irqrestore(&slice_convert_lock, flags); - - copro_flush_all_slbs(mm); -} - -/* - * Compute which slice addr is part of; - * set *boundary_addr to the start or end boundary of that slice - * (depending on 'end' parameter); - * return boolean indicating if the slice is marked as available in the - * 'available' slice_mark. - */ -static bool slice_scan_available(unsigned long addr, - const struct slice_mask *available, - int end, unsigned long *boundary_addr) -{ - unsigned long slice; - if (slice_addr_is_low(addr)) { - slice = GET_LOW_SLICE_INDEX(addr); - *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; - return !!(available->low_slices & (1u << slice)); - } else { - slice = GET_HIGH_SLICE_INDEX(addr); - *boundary_addr = (slice + end) ? - ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; - return !!test_bit(slice, available->high_slices); - } -} - -static unsigned long slice_find_area_bottomup(struct mm_struct *mm, - unsigned long len, - const struct slice_mask *available, - int psize, unsigned long high_limit) -{ - int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, next_end; - struct vm_unmapped_area_info info; - - info.flags = 0; - info.length = len; - info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); - info.align_offset = 0; - - addr = TASK_UNMAPPED_BASE; - /* - * Check till the allow max value for this mmap request - */ - while (addr < high_limit) { - info.low_limit = addr; - if (!slice_scan_available(addr, available, 1, &addr)) - continue; - - next_slice: - /* - * At this point [info.low_limit; addr) covers - * available slices only and ends at a slice boundary. - * Check if we need to reduce the range, or if we can - * extend it to cover the next available slice. - */ - if (addr >= high_limit) - addr = high_limit; - else if (slice_scan_available(addr, available, 1, &next_end)) { - addr = next_end; - goto next_slice; - } - info.high_limit = addr; - - found = vm_unmapped_area(&info); - if (!(found & ~PAGE_MASK)) - return found; - } - - return -ENOMEM; -} - -static unsigned long slice_find_area_topdown(struct mm_struct *mm, - unsigned long len, - const struct slice_mask *available, - int psize, unsigned long high_limit) -{ - int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, prev; - struct vm_unmapped_area_info info; - unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); - info.align_offset = 0; - - addr = mm->mmap_base; - /* - * If we are trying to allocate above DEFAULT_MAP_WINDOW - * Add the different to the mmap_base. - * Only for that request for which high_limit is above - * DEFAULT_MAP_WINDOW we should apply this. - */ - if (high_limit > DEFAULT_MAP_WINDOW) - addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; - - while (addr > min_addr) { - info.high_limit = addr; - if (!slice_scan_available(addr - 1, available, 0, &addr)) - continue; - - prev_slice: - /* - * At this point [addr; info.high_limit) covers - * available slices only and starts at a slice boundary. - * Check if we need to reduce the range, or if we can - * extend it to cover the previous available slice. - */ - if (addr < min_addr) - addr = min_addr; - else if (slice_scan_available(addr - 1, available, 0, &prev)) { - addr = prev; - goto prev_slice; - } - info.low_limit = addr; - - found = vm_unmapped_area(&info); - if (!(found & ~PAGE_MASK)) - return found; - } - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - return slice_find_area_bottomup(mm, len, available, psize, high_limit); -} - - -static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, - const struct slice_mask *mask, int psize, - int topdown, unsigned long high_limit) -{ - if (topdown) - return slice_find_area_topdown(mm, len, mask, psize, high_limit); - else - return slice_find_area_bottomup(mm, len, mask, psize, high_limit); -} - -static inline void slice_copy_mask(struct slice_mask *dst, - const struct slice_mask *src) -{ - dst->low_slices = src->low_slices; - if (!SLICE_NUM_HIGH) - return; - bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH); -} - -static inline void slice_or_mask(struct slice_mask *dst, - const struct slice_mask *src1, - const struct slice_mask *src2) -{ - dst->low_slices = src1->low_slices | src2->low_slices; - if (!SLICE_NUM_HIGH) - return; - bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); -} - -static inline void slice_andnot_mask(struct slice_mask *dst, - const struct slice_mask *src1, - const struct slice_mask *src2) -{ - dst->low_slices = src1->low_slices & ~src2->low_slices; - if (!SLICE_NUM_HIGH) - return; - bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); -} - -#ifdef CONFIG_PPC_64K_PAGES -#define MMU_PAGE_BASE MMU_PAGE_64K -#else -#define MMU_PAGE_BASE MMU_PAGE_4K -#endif - -unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, - unsigned long flags, unsigned int psize, - int topdown) -{ - struct slice_mask good_mask; - struct slice_mask potential_mask; - const struct slice_mask *maskp; - const struct slice_mask *compat_maskp = NULL; - int fixed = (flags & MAP_FIXED); - int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long page_size = 1UL << pshift; - struct mm_struct *mm = current->mm; - unsigned long newaddr; - unsigned long high_limit; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len > high_limit) - return -ENOMEM; - if (len & (page_size - 1)) - return -EINVAL; - if (fixed) { - if (addr & (page_size - 1)) - return -EINVAL; - if (addr > high_limit - len) - return -ENOMEM; - } - - if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { - /* - * Increasing the slb_addr_limit does not require - * slice mask cache to be recalculated because it should - * be already initialised beyond the old address limit. - */ - mm_ctx_set_slb_addr_limit(&mm->context, high_limit); - - on_each_cpu(slice_flush_segments, mm, 1); - } - - /* Sanity checks */ - BUG_ON(mm->task_size == 0); - BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); - VM_BUG_ON(radix_enabled()); - - slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); - slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", - addr, len, flags, topdown); - - /* If hint, make sure it matches our alignment restrictions */ - if (!fixed && addr) { - addr = ALIGN(addr, page_size); - slice_dbg(" aligned addr=%lx\n", addr); - /* Ignore hint if it's too large or overlaps a VMA */ - if (addr > high_limit - len || addr < mmap_min_addr || - !slice_area_is_free(mm, addr, len)) - addr = 0; - } - - /* First make up a "good" mask of slices that have the right size - * already - */ - maskp = slice_mask_for_size(&mm->context, psize); - - /* - * Here "good" means slices that are already the right page size, - * "compat" means slices that have a compatible page size (i.e. - * 4k in a 64k pagesize kernel), and "free" means slices without - * any VMAs. - * - * If MAP_FIXED: - * check if fits in good | compat => OK - * check if fits in good | compat | free => convert free - * else bad - * If have hint: - * check if hint fits in good => OK - * check if hint fits in good | free => convert free - * Otherwise: - * search in good, found => OK - * search in good | free, found => convert free - * search in good | compat | free, found => convert free. - */ - - /* - * If we support combo pages, we can allow 64k pages in 4k slices - * The mask copies could be avoided in most cases here if we had - * a pointer to good mask for the next code to use. - */ - if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { - compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); - if (fixed) - slice_or_mask(&good_mask, maskp, compat_maskp); - else - slice_copy_mask(&good_mask, maskp); - } else { - slice_copy_mask(&good_mask, maskp); - } - - slice_print_mask(" good_mask", &good_mask); - if (compat_maskp) - slice_print_mask(" compat_mask", compat_maskp); - - /* First check hint if it's valid or if we have MAP_FIXED */ - if (addr != 0 || fixed) { - /* Check if we fit in the good mask. If we do, we just return, - * nothing else to do - */ - if (slice_check_range_fits(mm, &good_mask, addr, len)) { - slice_dbg(" fits good !\n"); - newaddr = addr; - goto return_addr; - } - } else { - /* Now let's see if we can find something in the existing - * slices for that size - */ - newaddr = slice_find_area(mm, len, &good_mask, - psize, topdown, high_limit); - if (newaddr != -ENOMEM) { - /* Found within the good mask, we don't have to setup, - * we thus return directly - */ - slice_dbg(" found area at 0x%lx\n", newaddr); - goto return_addr; - } - } - /* - * We don't fit in the good mask, check what other slices are - * empty and thus can be converted - */ - slice_mask_for_free(mm, &potential_mask, high_limit); - slice_or_mask(&potential_mask, &potential_mask, &good_mask); - slice_print_mask(" potential", &potential_mask); - - if (addr != 0 || fixed) { - if (slice_check_range_fits(mm, &potential_mask, addr, len)) { - slice_dbg(" fits potential !\n"); - newaddr = addr; - goto convert; - } - } - - /* If we have MAP_FIXED and failed the above steps, then error out */ - if (fixed) - return -EBUSY; - - slice_dbg(" search...\n"); - - /* If we had a hint that didn't work out, see if we can fit - * anywhere in the good area. - */ - if (addr) { - newaddr = slice_find_area(mm, len, &good_mask, - psize, topdown, high_limit); - if (newaddr != -ENOMEM) { - slice_dbg(" found area at 0x%lx\n", newaddr); - goto return_addr; - } - } - - /* Now let's see if we can find something in the existing slices - * for that size plus free slices - */ - newaddr = slice_find_area(mm, len, &potential_mask, - psize, topdown, high_limit); - - if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && - psize == MMU_PAGE_64K) { - /* retry the search with 4k-page slices included */ - slice_or_mask(&potential_mask, &potential_mask, compat_maskp); - newaddr = slice_find_area(mm, len, &potential_mask, - psize, topdown, high_limit); - } - - if (newaddr == -ENOMEM) - return -ENOMEM; - - slice_range_to_mask(newaddr, len, &potential_mask); - slice_dbg(" found potential area at 0x%lx\n", newaddr); - slice_print_mask(" mask", &potential_mask); - - convert: - /* - * Try to allocate the context before we do slice convert - * so that we handle the context allocation failure gracefully. - */ - if (need_extra_context(mm, newaddr)) { - if (alloc_extended_context(mm, newaddr) < 0) - return -ENOMEM; - } - - slice_andnot_mask(&potential_mask, &potential_mask, &good_mask); - if (compat_maskp && !fixed) - slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp); - if (potential_mask.low_slices || - (SLICE_NUM_HIGH && - !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) { - slice_convert(mm, &potential_mask, psize); - if (psize > MMU_PAGE_BASE) - on_each_cpu(slice_flush_segments, mm, 1); - } - return newaddr; - -return_addr: - if (need_extra_context(mm, newaddr)) { - if (alloc_extended_context(mm, newaddr) < 0) - return -ENOMEM; - } - return newaddr; -} -EXPORT_SYMBOL_GPL(slice_get_unmapped_area); - -unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) -{ - unsigned char *psizes; - int index, mask_index; - - VM_BUG_ON(radix_enabled()); - - if (slice_addr_is_low(addr)) { - psizes = mm_ctx_low_slices(&mm->context); - index = GET_LOW_SLICE_INDEX(addr); - } else { - psizes = mm_ctx_high_slices(&mm->context); - index = GET_HIGH_SLICE_INDEX(addr); - } - mask_index = index & 0x1; - return (psizes[index >> 1] >> (mask_index * 4)) & 0xf; -} -EXPORT_SYMBOL_GPL(get_slice_psize); - -void slice_init_new_context_exec(struct mm_struct *mm) -{ - unsigned char *hpsizes, *lpsizes; - struct slice_mask *mask; - unsigned int psize = mmu_virtual_psize; - - slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm); - - /* - * In the case of exec, use the default limit. In the - * case of fork it is just inherited from the mm being - * duplicated. - */ - mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); - mm_ctx_set_user_psize(&mm->context, psize); - - /* - * Set all slice psizes to the default. - */ - lpsizes = mm_ctx_low_slices(&mm->context); - memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); - - hpsizes = mm_ctx_high_slices(&mm->context); - memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); - - /* - * Slice mask cache starts zeroed, fill the default size cache. - */ - mask = slice_mask_for_size(&mm->context, psize); - mask->low_slices = ~0UL; - if (SLICE_NUM_HIGH) - bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); -} - -#ifdef CONFIG_PPC_BOOK3S_64 -void slice_setup_new_exec(void) -{ - struct mm_struct *mm = current->mm; - - slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); - - if (!is_32bit_task()) - return; - - mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); -} -#endif - -void slice_set_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long len, unsigned int psize) -{ - struct slice_mask mask; - - VM_BUG_ON(radix_enabled()); - - slice_range_to_mask(start, len, &mask); - slice_convert(mm, &mask, psize); -} - -#ifdef CONFIG_HUGETLB_PAGE -/* - * is_hugepage_only_range() is used by generic code to verify whether - * a normal mmap mapping (non hugetlbfs) is valid on a given area. - * - * until the generic code provides a more generic hook and/or starts - * calling arch get_unmapped_area for MAP_FIXED (which our implementation - * here knows how to deal with), we hijack it to keep standard mappings - * away from us. - * - * because of that generic code limitation, MAP_FIXED mapping cannot - * "convert" back a slice with no VMAs to the standard page size, only - * get_unmapped_area() can. It would be possible to fix it here but I - * prefer working on fixing the generic code instead. - * - * WARNING: This will not work if hugetlbfs isn't enabled since the - * generic code will redefine that function as 0 in that. This is ok - * for now as we only use slices with hugetlbfs enabled. This should - * be fixed as the generic code gets fixed. - */ -int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - const struct slice_mask *maskp; - unsigned int psize = mm_ctx_user_psize(&mm->context); - - VM_BUG_ON(radix_enabled()); - - maskp = slice_mask_for_size(&mm->context, psize); - - /* We need to account for 4k slices too */ - if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { - const struct slice_mask *compat_maskp; - struct slice_mask available; - - compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); - slice_or_mask(&available, maskp, compat_maskp); - return !slice_check_range_fits(mm, &available, addr, len); - } - - return !slice_check_range_fits(mm, maskp, addr, len); -} - -unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - /* With radix we don't use slice, so derive it from vma*/ - if (radix_enabled()) - return vma_kernel_pagesize(vma); - - return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); -} -#endif -- cgit v1.2.3 From f693d38d9468101587175b1e62d7e4483b51d8f5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:31 +0200 Subject: powerpc/mm: Remove CONFIG_PPC_MM_SLICES CONFIG_PPC_MM_SLICES is always selected by hash book3s/64. CONFIG_PPC_MM_SLICES is never selected by other platforms. Remove it. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/dc2cdc204de8978574bf7c02329b6cfc4db0bce7.1649523076.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hugetlb.h | 2 +- arch/powerpc/include/asm/paca.h | 7 ------- arch/powerpc/kernel/paca.c | 5 ----- arch/powerpc/mm/book3s64/Makefile | 3 +-- arch/powerpc/mm/book3s64/hash_utils.c | 14 -------------- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/mmap.c | 4 ++-- arch/powerpc/platforms/Kconfig.cputype | 4 ---- 8 files changed, 5 insertions(+), 36 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 6a1a1ac5743b..ef86197d1c0a 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -24,7 +24,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { - if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) + if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU) && !radix_enabled()) return slice_is_hugepage_only_range(mm, addr, len); return 0; } diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 8330968ca346..03330b7d835f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -152,16 +152,9 @@ struct paca_struct { struct tlb_core_data tcd; #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PPC_BOOK3S #ifdef CONFIG_PPC_64S_HASH_MMU -#ifdef CONFIG_PPC_MM_SLICES unsigned char mm_ctx_low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; -#else - u16 mm_ctx_user_psize; - u16 mm_ctx_sllp; -#endif -#endif #endif /* diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 39da688a9455..ba593fd60124 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -344,15 +344,10 @@ void copy_mm_to_paca(struct mm_struct *mm) { mm_context_t *context = &mm->context; -#ifdef CONFIG_PPC_MM_SLICES VM_BUG_ON(!mm_ctx_slb_addr_limit(context)); memcpy(&get_paca()->mm_ctx_low_slices_psize, mm_ctx_low_slices(context), LOW_SLICE_ARRAY_SZ); memcpy(&get_paca()->mm_ctx_high_slices_psize, mm_ctx_high_slices(context), TASK_SLICE_ARRAY_SZ(context)); -#else /* CONFIG_PPC_MM_SLICES */ - get_paca()->mm_ctx_user_psize = context->user_psize; - get_paca()->mm_ctx_sllp = context->sllp; -#endif } #endif /* CONFIG_PPC_64S_HASH_MMU */ diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index af2f3e75d458..d527dc8e30a8 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -5,7 +5,7 @@ ccflags-y := $(NO_MINIMAL_TOC) obj-y += mmu_context.o pgtable.o trace.o ifdef CONFIG_PPC_64S_HASH_MMU CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) -obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o +obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o @@ -21,7 +21,6 @@ obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o endif obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o obj-$(CONFIG_PPC_PKEY) += pkeys.o -obj-$(CONFIG_PPC_MM_SLICES) += slice.o # Instrumenting the SLB fault path can lead to duplicate SLB entries KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 985cabdd7f67..43ecad8f9564 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1264,7 +1264,6 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) return pp; } -#ifdef CONFIG_PPC_MM_SLICES static unsigned int get_paca_psize(unsigned long addr) { unsigned char *psizes; @@ -1281,12 +1280,6 @@ static unsigned int get_paca_psize(unsigned long addr) return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; } -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->mm_ctx_user_psize; -} -#endif /* * Demote a segment to using 4k pages. @@ -1680,7 +1673,6 @@ DEFINE_INTERRUPT_HANDLER(do_hash_fault) } } -#ifdef CONFIG_PPC_MM_SLICES static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { int psize = get_slice_psize(mm, ea); @@ -1697,12 +1689,6 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) return true; } -#else -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - return true; -} -#endif static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, bool is_exec, unsigned long trap) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7b89f0799d82..f71ac14018e2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -558,7 +558,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return radix__hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); #endif -#ifdef CONFIG_PPC_MM_SLICES +#ifdef CONFIG_PPC_64S_HASH_MMU return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); #endif BUG(); diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index c475cf810aa8..9b0d6e395bc0 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -190,7 +190,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long pgoff, unsigned long flags) { -#ifdef CONFIG_PPC_MM_SLICES +#ifdef CONFIG_PPC_64S_HASH_MMU return slice_get_unmapped_area(addr, len, flags, mm_ctx_user_psize(¤t->mm->context), 0); #else @@ -204,7 +204,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long pgoff, const unsigned long flags) { -#ifdef CONFIG_PPC_MM_SLICES +#ifdef CONFIG_PPC_64S_HASH_MMU return slice_get_unmapped_area(addr0, len, flags, mm_ctx_user_psize(¤t->mm->context), 1); #else diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index e2e1fec91c6e..9d099dc75e8b 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -377,7 +377,6 @@ config SPE config PPC_64S_HASH_MMU bool "Hash MMU Support" depends on PPC_BOOK3S_64 - select PPC_MM_SLICES default y help Enable support for the Power ISA Hash style MMU. This is implemented @@ -451,9 +450,6 @@ config PPC_BOOK3E_MMU def_bool y depends on FSL_BOOKE || PPC_BOOK3E -config PPC_MM_SLICES - bool - config PPC_HAVE_PMU_SUPPORT bool -- cgit v1.2.3 From 76a345ed16c63df9b02d3e374e8d5e39471174ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:32 +0200 Subject: powerpc/mm: Use generic_get_unmapped_area() and call it from arch_get_unmapped_area() Use the generic version of arch_get_unmapped_area() which is now available at all time instead of its copy radix__arch_get_unmapped_area() To allow that for PPC64, add arch_get_mmap_base() and arch_get_mmap_end() macros. Instead of setting mm->get_unmapped_area() to either arch_get_unmapped_area() or generic_get_unmapped_area(), always set it to arch_get_unmapped_area() and call generic_get_unmapped_area() from there when radix is enabled. Do the same with radix__arch_get_unmapped_area_topdown() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/393be1fa386446443682fdb74544d733f68ef3bb.1649523076.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/task_size_64.h | 8 ++ arch/powerpc/mm/mmap.c | 127 ++------------------------------ 2 files changed, 14 insertions(+), 121 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h index 38fdf8041d12..5a709951c901 100644 --- a/arch/powerpc/include/asm/task_size_64.h +++ b/arch/powerpc/include/asm/task_size_64.h @@ -72,4 +72,12 @@ #define STACK_TOP_MAX TASK_SIZE_USER64 #define STACK_TOP (is_32bit_task() ? STACK_TOP_USER32 : STACK_TOP_USER64) +#define arch_get_mmap_base(addr, base) \ + (((addr) > DEFAULT_MAP_WINDOW) ? (base) + TASK_SIZE - DEFAULT_MAP_WINDOW : (base)) + +#define arch_get_mmap_end(addr, len, flags) \ + (((addr) > DEFAULT_MAP_WINDOW) || \ + (((flags) & MAP_FIXED) && ((addr) + (len) > DEFAULT_MAP_WINDOW)) ? TASK_SIZE : \ + DEFAULT_MAP_WINDOW) + #endif /* _ASM_POWERPC_TASK_SIZE_64_H */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 9b0d6e395bc0..46781d0103d1 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -81,115 +81,15 @@ static inline unsigned long mmap_base(unsigned long rnd, } #ifdef HAVE_ARCH_UNMAPPED_AREA -#ifdef CONFIG_PPC_RADIX_MMU -/* - * Same function as generic code used only for radix, because we don't need to overload - * the generic one. But we will have to duplicate, because hash select - * HAVE_ARCH_UNMAPPED_AREA - */ -static unsigned long -radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - return addr; - } - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = 0; - info.length = len; - info.low_limit = mm->mmap_base; - info.high_limit = high_limit; - info.align_mask = 0; - - return vm_unmapped_area(&info); -} - -static unsigned long -radix__arch_get_unmapped_area_topdown(struct file *filp, - const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - unsigned long addr = addr0; - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - return addr; - } - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = 0; - - addr = vm_unmapped_area(&info); - if (!(addr & ~PAGE_MASK)) - return addr; - VM_BUG_ON(addr != -ENOMEM); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags); -} -#endif - unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + if (radix_enabled()) + return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + #ifdef CONFIG_PPC_64S_HASH_MMU return slice_get_unmapped_area(addr, len, flags, mm_ctx_user_psize(¤t->mm->context), 0); @@ -204,6 +104,9 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long pgoff, const unsigned long flags) { + if (radix_enabled()) + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags); + #ifdef CONFIG_PPC_64S_HASH_MMU return slice_get_unmapped_area(addr0, len, flags, mm_ctx_user_psize(¤t->mm->context), 1); @@ -213,21 +116,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, } #endif /* HAVE_ARCH_UNMAPPED_AREA */ -static void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor, - struct rlimit *rlim_stack) -{ -#ifdef CONFIG_PPC_RADIX_MMU - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = radix__arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; - } -#endif -} - /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -239,9 +127,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - if (radix_enabled()) - return radix__arch_pick_mmap_layout(mm, random_factor, - rlim_stack); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: -- cgit v1.2.3 From 1a0261fd3b218b6999f38dc791a66c9b7ddc7e8b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:33 +0200 Subject: powerpc/mm: Use generic_hugetlb_get_unmapped_area() Use the generic version of arch_hugetlb_get_unmapped_area() which is now available at all time. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/05f77014c619061638ecc52a0a4136eb04cc2799.1649523076.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 -- arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 55 ---------------------------- arch/powerpc/mm/hugetlbpage.c | 4 +- 3 files changed, 1 insertion(+), 62 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 12e150e615b7..b37a28f62cf6 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -8,10 +8,6 @@ */ void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -extern unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 23d3e08911d3..d2fb776febb4 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -41,61 +41,6 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); } -/* - * A vairant of hugetlb_get_unmapped_area doing topdown search - * FIXME!! should we do as x86 does or non hugetlb area does ? - * ie, use topdown or not based on mmap_is_legacy check ? - */ -unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct hstate *h = hstate_file(file); - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - /* - * We are always doing an topdown search here. Slice code - * does that too. - */ - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - - return vm_unmapped_area(&info); -} - void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f71ac14018e2..a87c886042e9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -553,11 +553,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { -#ifdef CONFIG_PPC_RADIX_MMU if (radix_enabled()) - return radix__hugetlb_get_unmapped_area(file, addr, len, + return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); -#endif #ifdef CONFIG_PPC_64S_HASH_MMU return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); #endif -- cgit v1.2.3 From ab57bd7570d4393beb5a91bf092ed54e9c3574a2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:34 +0200 Subject: powerpc/mm: Move get_unmapped_area functions to slice.c hugetlb_get_unmapped_area() is now identical to the generic version if only RADIX is enabled, so move it to slice.c and let it fallback on the generic one when HASH MMU is not compiled in. Do the same with arch_get_unmapped_area() and arch_get_unmapped_area_topdown(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b5d9c124e82889e0cb115c150915a0c0d84eb960.1649523076.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/mmu.h | 6 ----- arch/powerpc/include/asm/book3s/64/slice.h | 6 +++++ arch/powerpc/mm/book3s64/slice.c | 42 ++++++++++++++++++++++++++++++ arch/powerpc/mm/hugetlbpage.c | 21 --------------- arch/powerpc/mm/mmap.c | 36 ------------------------- 5 files changed, 48 insertions(+), 63 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 006cbec70ffe..570a4960cf17 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -4,12 +4,6 @@ #include -#ifdef CONFIG_HUGETLB_PAGE -#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -#endif -#define HAVE_ARCH_UNMAPPED_AREA -#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN - #ifndef __ASSEMBLY__ /* * Page size definition diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index 5b0f7105bc8b..b8eb4ad271b9 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -4,6 +4,12 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#endif +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + #define SLICE_LOW_SHIFT 28 #define SLICE_LOW_TOP (0x100000000ul) #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index e4382713746d..03681042b807 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -639,6 +639,32 @@ return_addr: } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); +unsigned long arch_get_unmapped_area(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + if (radix_enabled()) + return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + + return slice_get_unmapped_area(addr, len, flags, + mm_ctx_user_psize(¤t->mm->context), 0); +} + +unsigned long arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + if (radix_enabled()) + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags); + + return slice_get_unmapped_area(addr0, len, flags, + mm_ctx_user_psize(¤t->mm->context), 1); +} + unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) { unsigned char *psizes; @@ -766,4 +792,20 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); } + +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + return shift_to_mmu_psize(huge_page_shift(hstate)); +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + if (radix_enabled()) + return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); + + return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); +} #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a87c886042e9..b282af39fcf6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -542,27 +542,6 @@ retry: return page; } -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -static inline int file_to_psize(struct file *file) -{ - struct hstate *hstate = hstate_file(file); - return shift_to_mmu_psize(huge_page_shift(hstate)); -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - if (radix_enabled()) - return generic_hugetlb_get_unmapped_area(file, addr, len, - pgoff, flags); -#ifdef CONFIG_PPC_64S_HASH_MMU - return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); -#endif - BUG(); -} -#endif - bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 46781d0103d1..5972d619d274 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -80,42 +80,6 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); } -#ifdef HAVE_ARCH_UNMAPPED_AREA -unsigned long arch_get_unmapped_area(struct file *filp, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - if (radix_enabled()) - return generic_get_unmapped_area(filp, addr, len, pgoff, flags); - -#ifdef CONFIG_PPC_64S_HASH_MMU - return slice_get_unmapped_area(addr, len, flags, - mm_ctx_user_psize(¤t->mm->context), 0); -#else - BUG(); -#endif -} - -unsigned long arch_get_unmapped_area_topdown(struct file *filp, - const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - if (radix_enabled()) - return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags); - -#ifdef CONFIG_PPC_64S_HASH_MMU - return slice_get_unmapped_area(addr0, len, flags, - mm_ctx_user_psize(¤t->mm->context), 1); -#else - BUG(); -#endif -} -#endif /* HAVE_ARCH_UNMAPPED_AREA */ - /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: -- cgit v1.2.3 From 36e5f9ee776cb6db6ab8cb9b056076c4492b9871 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:36 +0200 Subject: powerpc/mm: Convert to default topdown mmap layout Select CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT and remove arch/powerpc/mm/mmap.c This change reuses the generic framework added by commit 67f3977f805b ("arm64, mm: move generic mmap layout functions to mm") without any functional change. Comparison between powerpc implementation and the generic one: - mmap_is_legacy() is identical. - arch_mmap_rnd() does exactly the same allthough it's written slightly differently. - MIN_GAP and MAX_GAP are identical. - mmap_base() does the same but uses STACK_RND_MASK which provides the same values as stack_maxrandom_size(). - arch_pick_mmap_layout() is identical. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/518f9def87d3c889d5958103e7463cf45a2f673d.1649523076.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/processor.h | 2 - arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mmap.c | 105 ----------------------------------- 4 files changed, 2 insertions(+), 109 deletions(-) delete mode 100644 arch/powerpc/mm/mmap.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 174edabb74fa..145af02df3dc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -118,7 +118,6 @@ config PPC select ARCH_HAS_DEBUG_WX if STRICT_KERNEL_RWX select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES - select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_HUGEPD if HUGETLB_PAGE @@ -154,6 +153,7 @@ config PPC select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 39c25021030f..fdfaae194ddd 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -392,8 +392,6 @@ static inline void prefetchw(const void *x) #define spin_lock_prefetch(x) prefetchw(x) -#define HAVE_ARCH_PICK_MMAP_LAYOUT - /* asm stubs */ extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val); extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val); diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index d4c20484dad9..503a6e249940 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,7 +5,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y := fault.o mem.o pgtable.o mmap.o maccess.o pageattr.o \ +obj-y := fault.o mem.o pgtable.o maccess.o pageattr.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o ioremap.o ioremap_$(BITS).o \ init-common.o mmu_context.o drmem.o \ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c deleted file mode 100644 index d9eae456558a..000000000000 --- a/arch/powerpc/mm/mmap.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * flexible mmap layout support - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * Started by Ingo Molnar - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Top of mmap area (just below the process stack). - * - * Leave at least a ~128 MB hole. - */ -#define MIN_GAP (128*1024*1024) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -unsigned long arch_mmap_rnd(void) -{ - unsigned long shift, rnd; - - shift = mmap_rnd_bits; -#ifdef CONFIG_COMPAT - if (is_32bit_task()) - shift = mmap_rnd_compat_bits; -#endif - rnd = get_random_long() % (1ul << shift); - - return rnd << PAGE_SHIFT; -} - -static inline unsigned long stack_maxrandom_size(void) -{ - if (!(current->flags & PF_RANDOMIZE)) - return 0; - - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - return (1<<23); - else - return (1<<30); -} - -static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_maxrandom_size() + stack_guard_gap; - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); -} - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} -- cgit v1.2.3 From f31c618373f2051a32e30002d8eacad7bbbd3885 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 30 Mar 2022 19:37:17 +0530 Subject: powerpc: Sort and de-dup primary opcodes in ppc-opcode.h Some of the primary opcodes are duplicated. Remove those, and sort the rest of the primary opcodes to make it easy to read. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a05edf638a2638d708fc2db0272f6317837b5eab.1648648712.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/ppc-opcode.h | 69 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 38 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 82f1f0041c6f..a5d89cd3e8d1 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -127,8 +127,37 @@ /* opcode and xopcode for instructions */ -#define OP_TRAP 3 -#define OP_TRAP_64 2 +#define OP_PREFIX 1 +#define OP_TRAP_64 2 +#define OP_TRAP 3 +#define OP_31 31 +#define OP_LWZ 32 +#define OP_LWZU 33 +#define OP_LBZ 34 +#define OP_LBZU 35 +#define OP_STW 36 +#define OP_STWU 37 +#define OP_STB 38 +#define OP_STBU 39 +#define OP_LHZ 40 +#define OP_LHZU 41 +#define OP_LHA 42 +#define OP_LHAU 43 +#define OP_STH 44 +#define OP_STHU 45 +#define OP_LMW 46 +#define OP_STMW 47 +#define OP_LFS 48 +#define OP_LFSU 49 +#define OP_LFD 50 +#define OP_LFDU 51 +#define OP_STFS 52 +#define OP_STFSU 53 +#define OP_STFD 54 +#define OP_STFDU 55 +#define OP_LQ 56 +#define OP_LD 58 +#define OP_STD 62 #define OP_31_XOP_TRAP 4 #define OP_31_XOP_LDX 21 @@ -208,42 +237,6 @@ /* VMX Vector Store Instructions */ #define OP_31_XOP_STVX 231 -/* Prefixed Instructions */ -#define OP_PREFIX 1 - -#define OP_31 31 -#define OP_LWZ 32 -#define OP_STFS 52 -#define OP_STFSU 53 -#define OP_STFD 54 -#define OP_STFDU 55 -#define OP_LD 58 -#define OP_LWZU 33 -#define OP_LBZ 34 -#define OP_LBZU 35 -#define OP_STW 36 -#define OP_STWU 37 -#define OP_STD 62 -#define OP_STB 38 -#define OP_STBU 39 -#define OP_LHZ 40 -#define OP_LHZU 41 -#define OP_LHA 42 -#define OP_LHAU 43 -#define OP_STH 44 -#define OP_STHU 45 -#define OP_LMW 46 -#define OP_STMW 47 -#define OP_LFS 48 -#define OP_LFSU 49 -#define OP_LFD 50 -#define OP_LFDU 51 -#define OP_STFS 52 -#define OP_STFSU 53 -#define OP_STFD 54 -#define OP_STFDU 55 -#define OP_LQ 56 - /* sorted alphabetically */ #define PPC_INST_BCCTR_FLUSH 0x4c400420 #define PPC_INST_COPY 0x7c20060c -- cgit v1.2.3 From 54cdacd7d3b3c1a8dc10965f56c8b5eb8eda1a33 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 30 Mar 2022 19:37:18 +0530 Subject: powerpc: Reject probes on instructions that can't be single stepped Per the ISA, a Trace interrupt is not generated for: - [h|u]rfi[d] - rfscv - sc, scv, and Trap instructions that trap - Power-Saving Mode instructions - other instructions that cause interrupts (other than Trace interrupts) - the first instructions of any interrupt handler (applies to Branch and Single Step tracing; CIABR matches may still occur) - instructions that are emulated by software Add a helper to check for instructions belonging to the first four categories above and to reject kprobes, uprobes and xmon breakpoints on such instructions. We reject probing on instructions belonging to these categories across all ISA versions and across both BookS and BookE. For trap instructions, we can't know in advance if they can cause a trap, and there is no good reason to allow probing on those. Also, uprobes already refuses to probe trap instructions and kprobes does not allow probes on trap instructions used for kernel warnings and bugs. As such, stop allowing any type of probes/breakpoints on trap instruction across uprobes, kprobes and xmon. For some of the fp/altivec instructions that can generate an interrupt and which we emulate in the kernel (altivec assist, for example), we check and turn off single stepping in emulate_single_step(). Instructions generating a DSI are restarted and single stepping normally completes once the instruction is completed. In uprobes, if a single stepped instruction results in a non-fatal signal to be delivered to the task, such signals are "delayed" until after the instruction completes. For fatal signals, single stepping is cancelled and the instruction restarted in-place so that core dump captures proper addresses. In kprobes, we do not allow probes on instructions having an extable entry and we also do not allow probing interrupt vectors. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f56ee979d50b8711fae350fc97870f3ca34acd75.1648648712.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/ppc-opcode.h | 18 ++++++++++++++++++ arch/powerpc/include/asm/probes.h | 36 +++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/kprobes.c | 4 ++-- arch/powerpc/kernel/uprobes.c | 5 +++++ arch/powerpc/xmon/xmon.c | 11 +++++------ 5 files changed, 66 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index a5d89cd3e8d1..683e9bc618a7 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -130,6 +130,8 @@ #define OP_PREFIX 1 #define OP_TRAP_64 2 #define OP_TRAP 3 +#define OP_SC 17 +#define OP_19 19 #define OP_31 31 #define OP_LWZ 32 #define OP_LWZU 33 @@ -159,6 +161,20 @@ #define OP_LD 58 #define OP_STD 62 +#define OP_19_XOP_RFID 18 +#define OP_19_XOP_RFMCI 38 +#define OP_19_XOP_RFDI 39 +#define OP_19_XOP_RFI 50 +#define OP_19_XOP_RFCI 51 +#define OP_19_XOP_RFSCV 82 +#define OP_19_XOP_HRFID 274 +#define OP_19_XOP_URFID 306 +#define OP_19_XOP_STOP 370 +#define OP_19_XOP_DOZE 402 +#define OP_19_XOP_NAP 434 +#define OP_19_XOP_SLEEP 466 +#define OP_19_XOP_RVWINKLE 498 + #define OP_31_XOP_TRAP 4 #define OP_31_XOP_LDX 21 #define OP_31_XOP_LWZX 23 @@ -179,6 +195,8 @@ #define OP_31_XOP_LHZUX 311 #define OP_31_XOP_MSGSNDP 142 #define OP_31_XOP_MSGCLRP 174 +#define OP_31_XOP_MTMSR 146 +#define OP_31_XOP_MTMSRD 178 #define OP_31_XOP_TLBIE 306 #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 diff --git a/arch/powerpc/include/asm/probes.h b/arch/powerpc/include/asm/probes.h index c5d984700d24..6f66e358aa37 100644 --- a/arch/powerpc/include/asm/probes.h +++ b/arch/powerpc/include/asm/probes.h @@ -8,6 +8,7 @@ * Copyright IBM Corporation, 2012 */ #include +#include typedef u32 ppc_opcode_t; #define BREAKPOINT_INSTRUCTION 0x7fe00008 /* trap */ @@ -31,6 +32,41 @@ typedef u32 ppc_opcode_t; #define MSR_SINGLESTEP (MSR_SE) #endif +static inline bool can_single_step(u32 inst) +{ + switch (get_op(inst)) { + case OP_TRAP_64: return false; + case OP_TRAP: return false; + case OP_SC: return false; + case OP_19: + switch (get_xop(inst)) { + case OP_19_XOP_RFID: return false; + case OP_19_XOP_RFMCI: return false; + case OP_19_XOP_RFDI: return false; + case OP_19_XOP_RFI: return false; + case OP_19_XOP_RFCI: return false; + case OP_19_XOP_RFSCV: return false; + case OP_19_XOP_HRFID: return false; + case OP_19_XOP_URFID: return false; + case OP_19_XOP_STOP: return false; + case OP_19_XOP_DOZE: return false; + case OP_19_XOP_NAP: return false; + case OP_19_XOP_SLEEP: return false; + case OP_19_XOP_RVWINKLE: return false; + } + break; + case OP_31: + switch (get_xop(inst)) { + case OP_31_XOP_TRAP: return false; + case OP_31_XOP_TRAP_64: return false; + case OP_31_XOP_MTMSR: return false; + case OP_31_XOP_MTMSRD: return false; + } + break; + } + return true; +} + /* Enable single stepping for the current task */ static inline void enable_single_step(struct pt_regs *regs) { diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 7dae0b01abfb..d5f55e5c8971 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -150,8 +150,8 @@ int arch_prepare_kprobe(struct kprobe *p) if ((unsigned long)p->addr & 0x03) { printk("Attempt to register kprobe at an unaligned address\n"); ret = -EINVAL; - } else if (IS_MTMSRD(insn) || IS_RFID(insn)) { - printk("Cannot register a kprobe on mtmsr[d]/rfi[d]\n"); + } else if (!can_single_step(ppc_inst_val(insn))) { + printk("Cannot register a kprobe on instructions that can't be single stepped\n"); ret = -EINVAL; } else if ((unsigned long)p->addr & ~PAGE_MASK && ppc_inst_prefixed(ppc_inst_read(p->addr - 1))) { diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index c6975467d9ff..95a41ae9dfa7 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -48,6 +48,11 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, return -EINVAL; } + if (!can_single_step(ppc_inst_val(ppc_inst_read(auprobe->insn)))) { + pr_info_ratelimited("Cannot register a uprobe on instructions that can't be single stepped\n"); + return -ENOTSUPP; + } + return 0; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 27da7d5c2024..f93090c033a4 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -921,9 +921,9 @@ static void insert_bpts(void) bp->enabled = 0; continue; } - if (IS_MTMSRD(instr) || IS_RFID(instr)) { - printf("Breakpoint at %lx is on an mtmsrd or rfid " - "instruction, disabling it\n", bp->address); + if (!can_single_step(ppc_inst_val(instr))) { + printf("Breakpoint at %lx is on an instruction that can't be single stepped, disabling it\n", + bp->address); bp->enabled = 0; continue; } @@ -1470,9 +1470,8 @@ static long check_bp_loc(unsigned long addr) printf("Can't read instruction at address %lx\n", addr); return 0; } - if (IS_MTMSRD(instr) || IS_RFID(instr)) { - printf("Breakpoints may not be placed on mtmsrd or rfid " - "instructions\n"); + if (!can_single_step(ppc_inst_val(instr))) { + printf("Breakpoints may not be placed on instructions that can't be single stepped\n"); return 0; } return 1; -- cgit v1.2.3 From f206fdd9d41bf7deb96219b8ca3499a5abd79b83 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 12 Feb 2022 08:36:17 +0100 Subject: powerpc: Reduce csum_add() complexity for PPC64 PPC64 does everything in C, gcc is able to skip calculation when one of the operands in zero. Move the constant folding in PPC32 part. This helps GCC and reduces ppc64_defconfig by 170 bytes. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a4ca63dd4c4b09e1906d08fb814af5a41d0f3fcb.1644651363.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/checksum.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h index ab3832b93f0a..8321f6053a67 100644 --- a/arch/powerpc/include/asm/checksum.h +++ b/arch/powerpc/include/asm/checksum.h @@ -95,16 +95,15 @@ static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { #ifdef __powerpc64__ u64 res = (__force u64)csum; -#endif + + res += (__force u64)addend; + return (__force __wsum)((u32)res + (res >> 32)); +#else if (__builtin_constant_p(csum) && csum == 0) return addend; if (__builtin_constant_p(addend) && addend == 0) return csum; -#ifdef __powerpc64__ - res += (__force u64)addend; - return (__force __wsum)((u32)res + (res >> 32)); -#else asm("addc %0,%0,%1;" "addze %0,%0;" : "+r" (csum) : "r" (addend) : "xer"); -- cgit v1.2.3 From 0aa297e73bba35bedadbd2d452c62d643dfd4a32 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 20:20:20 +0100 Subject: powerpc/64: Move pci_device_from_OF_node() out of asm/pci-bridge.h Move pci_device_from_OF_node() in pci64.c because it needs definition of struct device_node and is not worth inlining. ppc32.c already has it in pci32.c. That way pci-bridge.h doesn't need linux/of.h (Brought by asm/prom.h via asm/pci.h) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3c88286b55413730d7784133993a46ef4a3607ce.1646767214.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/pci-bridge.h | 14 ++------------ arch/powerpc/kernel/pci_64.c | 9 +++++++++ 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 90f488fa4c17..c85f901227c9 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -170,10 +170,10 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) return bus->sysdata; } -#ifndef CONFIG_PPC64 - extern int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn); +#ifndef CONFIG_PPC64 + extern void pci_create_OF_bus_map(void); #else /* CONFIG_PPC64 */ @@ -235,16 +235,6 @@ struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev); void remove_sriov_vf_pdns(struct pci_dev *pdev); #endif -static inline int pci_device_from_OF_node(struct device_node *np, - u8 *bus, u8 *devfn) -{ - if (!PCI_DN(np)) - return -ENODEV; - *bus = PCI_DN(np)->busno; - *devfn = PCI_DN(np)->devfn; - return 0; -} - #if defined(CONFIG_EEH) static inline struct eeh_dev *pdn_to_eeh_dev(struct pci_dn *pdn) { diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 3fb7e572abed..81f84e71ab93 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -285,3 +285,12 @@ int pcibus_to_node(struct pci_bus *bus) } EXPORT_SYMBOL(pcibus_to_node); #endif + +int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) +{ + if (!PCI_DN(np)) + return -ENODEV; + *bus = PCI_DN(np)->busno; + *devfn = PCI_DN(np)->devfn; + return 0; +} -- cgit v1.2.3 From 07071346bb76f4fbc2c1ca8894ec3d3ad2f22577 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 20:20:21 +0100 Subject: powerpc: Don't include asm/prom.h in asm/parport.h parport.h needs only of_irq.h, no need to go via asm/prom.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ec796ee56cf61f16ba24e62a9d3525d11931538c.1646767214.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/parport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/parport.h b/arch/powerpc/include/asm/parport.h index 8abfb8f7c33d..42cc321ed754 100644 --- a/arch/powerpc/include/asm/parport.h +++ b/arch/powerpc/include/asm/parport.h @@ -11,7 +11,7 @@ #define _ASM_POWERPC_PARPORT_H #ifdef __KERNEL__ -#include +#include static int parport_pc_find_nonpci_ports (int autoirq, int autodma) { -- cgit v1.2.3 From eb4713c40a619046af99586743d48b9b4435d371 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 20:20:22 +0100 Subject: powerpc: Include asm/reg.h in asm/svm.h is_secure_guest() uses mfmsr(). Don't rely on users to include asm/reg.h, include it in asm/svm.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/482c82c8a29d5fb3ea279b34f107e0e775001344.1646767214.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/svm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h index 7546402d796a..da864623cc00 100644 --- a/arch/powerpc/include/asm/svm.h +++ b/arch/powerpc/include/asm/svm.h @@ -10,6 +10,8 @@ #ifdef CONFIG_PPC_SVM +#include + static inline bool is_secure_guest(void) { return mfmsr() & MSR_S; -- cgit v1.2.3 From 669df99c957561dddf580ec269fb4255c41dabc1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 20:20:23 +0100 Subject: powerpc: Add missing declaration in asm/drmem.h Don't rely on random inclusion of linux/of.h by users of asm/drmem.h Add a forward declaration of struct property and struct device_node. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5643ec410e51b749db0636471cb7979524f9ed0e.1646767214.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/drmem.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 4265d5e95c2c..13bf6dee8e2d 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -23,6 +23,9 @@ struct drmem_lmb_info { u64 lmb_size; }; +struct device_node; +struct property; + extern struct drmem_lmb_info *drmem_info; static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb, -- cgit v1.2.3 From 6808b7f5c8255d07d79cb8eac40047f59e4154ad Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 4 May 2022 10:20:46 +0300 Subject: termbits: Convert octal defines to hex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many archs have termbits.h as octal numbers. It makes hard for humans to parse the magnitude of large numbers correctly and to compare with hex ones of the same define. Convert octal values to hex. First step is an automated conversion with: for i in $(git ls-files | grep 'termbits\.h'); do awk --non-decimal-data '/^#define\s+[A-Z][A-Z0-9]*\s+0[0-9]/ { l=int(((length($3) - 1) * 3 + 3) / 4); repl = sprintf("0x%0" l "x", $3); print gensub(/[^[:blank:]]+/, repl, 3); next} {print}' $i > $i~; mv $i~ $i; done On top of that, some manual processing on alignment and number of zeros. In addition, small tweaks to formatting of a few comments on the same lines. Signed-off-by: Ilpo Järvinen Acked-by: Arnd Bergmann Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/r/2c8c96f-a12f-aadc-18ac-34c1d371929c@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/uapi/asm/termbits.h | 202 ++++++++++++++-------------- arch/mips/include/uapi/asm/termbits.h | 222 +++++++++++++++---------------- arch/parisc/include/uapi/asm/termbits.h | 220 +++++++++++++++--------------- arch/powerpc/include/uapi/asm/termbits.h | 202 ++++++++++++++-------------- include/uapi/asm-generic/termbits.h | 220 +++++++++++++++--------------- 5 files changed, 533 insertions(+), 533 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/uapi/asm/termbits.h b/arch/alpha/include/uapi/asm/termbits.h index 4575ba34a0ea..30dc7ff777b8 100644 --- a/arch/alpha/include/uapi/asm/termbits.h +++ b/arch/alpha/include/uapi/asm/termbits.h @@ -72,57 +72,57 @@ struct ktermios { #define VTIME 17 /* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IXON 0001000 -#define IXOFF 0002000 -#define IXANY 0004000 -#define IUCLC 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 +#define IGNBRK 0x00001 +#define BRKINT 0x00002 +#define IGNPAR 0x00004 +#define PARMRK 0x00008 +#define INPCK 0x00010 +#define ISTRIP 0x00020 +#define INLCR 0x00040 +#define IGNCR 0x00080 +#define ICRNL 0x00100 +#define IXON 0x00200 +#define IXOFF 0x00400 +#define IXANY 0x00800 +#define IUCLC 0x01000 +#define IMAXBEL 0x02000 +#define IUTF8 0x04000 /* c_oflag bits */ -#define OPOST 0000001 -#define ONLCR 0000002 -#define OLCUC 0000004 - -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 - -#define OFILL 00000100 -#define OFDEL 00000200 -#define NLDLY 00001400 -#define NL0 00000000 -#define NL1 00000400 -#define NL2 00001000 -#define NL3 00001400 -#define TABDLY 00006000 -#define TAB0 00000000 -#define TAB1 00002000 -#define TAB2 00004000 -#define TAB3 00006000 -#define CRDLY 00030000 -#define CR0 00000000 -#define CR1 00010000 -#define CR2 00020000 -#define CR3 00030000 -#define FFDLY 00040000 -#define FF0 00000000 -#define FF1 00040000 -#define BSDLY 00100000 -#define BS0 00000000 -#define BS1 00100000 -#define VTDLY 00200000 -#define VT0 00000000 -#define VT1 00200000 +#define OPOST 0x00001 +#define ONLCR 0x00002 +#define OLCUC 0x00004 + +#define OCRNL 0x00008 +#define ONOCR 0x00010 +#define ONLRET 0x00020 + +#define OFILL 0x000040 +#define OFDEL 0x000080 +#define NLDLY 0x000300 +#define NL0 0x000000 +#define NL1 0x000100 +#define NL2 0x000200 +#define NL3 0x000300 +#define TABDLY 0x000c00 +#define TAB0 0x000000 +#define TAB1 0x000400 +#define TAB2 0x000800 +#define TAB3 0x000c00 +#define CRDLY 0x003000 +#define CR0 0x000000 +#define CR1 0x001000 +#define CR2 0x002000 +#define CR3 0x003000 +#define FFDLY 0x004000 +#define FF0 0x000000 +#define FF1 0x004000 +#define BSDLY 0x008000 +#define BS0 0x000000 +#define BS1 0x008000 +#define VTDLY 0x010000 +#define VT0 0x000000 +#define VT1 0x010000 /* * Should be equivalent to TAB3, see description of TAB3 in * POSIX.1-2008, Ch. 11.2.3 "Output Modes" @@ -130,60 +130,60 @@ struct ktermios { #define XTABS TAB3 /* c_cflag bit meaning */ -#define CBAUD 0000037 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 +#define CBAUD 0x0000001f +#define B0 0x00000000 /* hang up */ +#define B50 0x00000001 +#define B75 0x00000002 +#define B110 0x00000003 +#define B134 0x00000004 +#define B150 0x00000005 +#define B200 0x00000006 +#define B300 0x00000007 +#define B600 0x00000008 +#define B1200 0x00000009 +#define B1800 0x0000000a +#define B2400 0x0000000b +#define B4800 0x0000000c +#define B9600 0x0000000d +#define B19200 0x0000000e +#define B38400 0x0000000f #define EXTA B19200 #define EXTB B38400 -#define CBAUDEX 0000000 -#define B57600 00020 -#define B115200 00021 -#define B230400 00022 -#define B460800 00023 -#define B500000 00024 -#define B576000 00025 -#define B921600 00026 -#define B1000000 00027 -#define B1152000 00030 -#define B1500000 00031 -#define B2000000 00032 -#define B2500000 00033 -#define B3000000 00034 -#define B3500000 00035 -#define B4000000 00036 -#define BOTHER 00037 - -#define CSIZE 00001400 -#define CS5 00000000 -#define CS6 00000400 -#define CS7 00001000 -#define CS8 00001400 - -#define CSTOPB 00002000 -#define CREAD 00004000 -#define PARENB 00010000 -#define PARODD 00020000 -#define HUPCL 00040000 - -#define CLOCAL 00100000 -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ - -#define CIBAUD 07600000 +#define CBAUDEX 0x00000000 +#define B57600 0x00000010 +#define B115200 0x00000011 +#define B230400 0x00000012 +#define B460800 0x00000013 +#define B500000 0x00000014 +#define B576000 0x00000015 +#define B921600 0x00000016 +#define B1000000 0x00000017 +#define B1152000 0x00000018 +#define B1500000 0x00000019 +#define B2000000 0x0000001a +#define B2500000 0x0000001b +#define B3000000 0x0000001c +#define B3500000 0x0000001d +#define B4000000 0x0000001e +#define BOTHER 0x0000001f + +#define CSIZE 0x00000300 +#define CS5 0x00000000 +#define CS6 0x00000100 +#define CS7 0x00000200 +#define CS8 0x00000300 + +#define CSTOPB 0x00000400 +#define CREAD 0x00000800 +#define PARENB 0x00001000 +#define PARODD 0x00002000 +#define HUPCL 0x00004000 + +#define CLOCAL 0x00008000 +#define CMSPAR 0x40000000 /* mark or space (stick) parity */ +#define CRTSCTS 0x80000000 /* flow control */ + +#define CIBAUD 0x1f0000 #define IBSHIFT 16 /* c_lflag bits */ diff --git a/arch/mips/include/uapi/asm/termbits.h b/arch/mips/include/uapi/asm/termbits.h index dfeffba729b7..d1315ccdb39a 100644 --- a/arch/mips/include/uapi/asm/termbits.h +++ b/arch/mips/include/uapi/asm/termbits.h @@ -80,131 +80,131 @@ struct ktermios { #define VEOL 17 /* End-of-line character [ICANON]. */ /* c_iflag bits */ -#define IGNBRK 0000001 /* Ignore break condition. */ -#define BRKINT 0000002 /* Signal interrupt on break. */ -#define IGNPAR 0000004 /* Ignore characters with parity errors. */ -#define PARMRK 0000010 /* Mark parity and framing errors. */ -#define INPCK 0000020 /* Enable input parity check. */ -#define ISTRIP 0000040 /* Strip 8th bit off characters. */ -#define INLCR 0000100 /* Map NL to CR on input. */ -#define IGNCR 0000200 /* Ignore CR. */ -#define ICRNL 0000400 /* Map CR to NL on input. */ -#define IUCLC 0001000 /* Map upper case to lower case on input. */ -#define IXON 0002000 /* Enable start/stop output control. */ -#define IXANY 0004000 /* Any character will restart after stop. */ -#define IXOFF 0010000 /* Enable start/stop input control. */ -#define IMAXBEL 0020000 /* Ring bell when input queue is full. */ -#define IUTF8 0040000 /* Input is UTF-8 */ +#define IGNBRK 0x00001 /* Ignore break condition. */ +#define BRKINT 0x00002 /* Signal interrupt on break. */ +#define IGNPAR 0x00004 /* Ignore characters with parity errors. */ +#define PARMRK 0x00008 /* Mark parity and framing errors. */ +#define INPCK 0x00010 /* Enable input parity check. */ +#define ISTRIP 0x00020 /* Strip 8th bit off characters. */ +#define INLCR 0x00040 /* Map NL to CR on input. */ +#define IGNCR 0x00080 /* Ignore CR. */ +#define ICRNL 0x00100 /* Map CR to NL on input. */ +#define IUCLC 0x00200 /* Map upper case to lower case on input. */ +#define IXON 0x00400 /* Enable start/stop output control. */ +#define IXANY 0x00800 /* Any character will restart after stop. */ +#define IXOFF 0x01000 /* Enable start/stop input control. */ +#define IMAXBEL 0x02000 /* Ring bell when input queue is full. */ +#define IUTF8 0x04000 /* Input is UTF-8 */ /* c_oflag bits */ -#define OPOST 0000001 /* Perform output processing. */ -#define OLCUC 0000002 /* Map lower case to upper case on output. */ -#define ONLCR 0000004 /* Map NL to CR-NL on output. */ -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 +#define OPOST 0x00001 /* Perform output processing. */ +#define OLCUC 0x00002 /* Map lower case to upper case on output. */ +#define ONLCR 0x00004 /* Map NL to CR-NL on output. */ +#define OCRNL 0x00008 +#define ONOCR 0x00010 +#define ONLRET 0x00020 +#define OFILL 0x00040 +#define OFDEL 0x00080 +#define NLDLY 0x00100 +#define NL0 0x00000 +#define NL1 0x00100 +#define CRDLY 0x00600 +#define CR0 0x00000 +#define CR1 0x00200 +#define CR2 0x00400 +#define CR3 0x00600 +#define TABDLY 0x01800 +#define TAB0 0x00000 +#define TAB1 0x00800 +#define TAB2 0x01000 +#define TAB3 0x01800 +#define XTABS 0x01800 +#define BSDLY 0x02000 +#define BS0 0x00000 +#define BS1 0x02000 +#define VTDLY 0x04000 +#define VT0 0x00000 +#define VT1 0x04000 +#define FFDLY 0x08000 +#define FF0 0x00000 +#define FF1 0x08000 /* #define PAGEOUT ??? #define WRAP ??? */ /* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 +#define CBAUD 0x0000100f +#define B0 0x00000000 /* hang up */ +#define B50 0x00000001 +#define B75 0x00000002 +#define B110 0x00000003 +#define B134 0x00000004 +#define B150 0x00000005 +#define B200 0x00000006 +#define B300 0x00000007 +#define B600 0x00000008 +#define B1200 0x00000009 +#define B1800 0x0000000a +#define B2400 0x0000000b +#define B4800 0x0000000c +#define B9600 0x0000000d +#define B19200 0x0000000e +#define B38400 0x0000000f #define EXTA B19200 #define EXTB B38400 -#define CSIZE 0000060 /* Number of bits per byte (mask). */ -#define CS5 0000000 /* 5 bits per byte. */ -#define CS6 0000020 /* 6 bits per byte. */ -#define CS7 0000040 /* 7 bits per byte. */ -#define CS8 0000060 /* 8 bits per byte. */ -#define CSTOPB 0000100 /* Two stop bits instead of one. */ -#define CREAD 0000200 /* Enable receiver. */ -#define PARENB 0000400 /* Parity enable. */ -#define PARODD 0001000 /* Odd parity instead of even. */ -#define HUPCL 0002000 /* Hang up on last close. */ -#define CLOCAL 0004000 /* Ignore modem status lines. */ -#define CBAUDEX 0010000 -#define BOTHER 0010000 -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ +#define CSIZE 0x00000030 /* Number of bits per byte (mask) */ +#define CS5 0x00000000 /* 5 bits per byte */ +#define CS6 0x00000010 /* 6 bits per byte */ +#define CS7 0x00000020 /* 7 bits per byte */ +#define CS8 0x00000030 /* 8 bits per byte */ +#define CSTOPB 0x00000040 /* Two stop bits instead of one */ +#define CREAD 0x00000080 /* Enable receiver */ +#define PARENB 0x00000100 /* Parity enable */ +#define PARODD 0x00000200 /* Odd parity instead of even */ +#define HUPCL 0x00000400 /* Hang up on last close */ +#define CLOCAL 0x00000800 /* Ignore modem status lines */ +#define CBAUDEX 0x00001000 +#define BOTHER 0x00001000 +#define B57600 0x00001001 +#define B115200 0x00001002 +#define B230400 0x00001003 +#define B460800 0x00001004 +#define B500000 0x00001005 +#define B576000 0x00001006 +#define B921600 0x00001007 +#define B1000000 0x00001008 +#define B1152000 0x00001009 +#define B1500000 0x0000100a +#define B2000000 0x0000100b +#define B2500000 0x0000100c +#define B3000000 0x0000100d +#define B3500000 0x0000100e +#define B4000000 0x0000100f +#define CIBAUD 0x100f0000 /* input baud rate */ +#define CMSPAR 0x40000000 /* mark or space (stick) parity */ +#define CRTSCTS 0x80000000 /* flow control */ #define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ /* c_lflag bits */ -#define ISIG 0000001 /* Enable signals. */ -#define ICANON 0000002 /* Do erase and kill processing. */ -#define XCASE 0000004 -#define ECHO 0000010 /* Enable echo. */ -#define ECHOE 0000020 /* Visual erase for ERASE. */ -#define ECHOK 0000040 /* Echo NL after KILL. */ -#define ECHONL 0000100 /* Echo NL even if ECHO is off. */ -#define NOFLSH 0000200 /* Disable flush after interrupt. */ -#define IEXTEN 0000400 /* Enable DISCARD and LNEXT. */ -#define ECHOCTL 0001000 /* Echo control characters as ^X. */ -#define ECHOPRT 0002000 /* Hardcopy visual erase. */ -#define ECHOKE 0004000 /* Visual erase for KILL. */ -#define FLUSHO 0020000 -#define PENDIN 0040000 /* Retype pending input (state). */ -#define TOSTOP 0100000 /* Send SIGTTOU for background output. */ -#define ITOSTOP TOSTOP -#define EXTPROC 0200000 /* External processing on pty */ +#define ISIG 0x00001 /* Enable signals. */ +#define ICANON 0x00002 /* Do erase and kill processing. */ +#define XCASE 0x00004 +#define ECHO 0x00008 /* Enable echo. */ +#define ECHOE 0x00010 /* Visual erase for ERASE. */ +#define ECHOK 0x00020 /* Echo NL after KILL. */ +#define ECHONL 0x00040 /* Echo NL even if ECHO is off. */ +#define NOFLSH 0x00080 /* Disable flush after interrupt. */ +#define IEXTEN 0x00100 /* Enable DISCARD and LNEXT. */ +#define ECHOCTL 0x00200 /* Echo control characters as ^X. */ +#define ECHOPRT 0x00400 /* Hardcopy visual erase. */ +#define ECHOKE 0x00800 /* Visual erase for KILL. */ +#define FLUSHO 0x02000 +#define PENDIN 0x04000 /* Retype pending input (state). */ +#define TOSTOP 0x08000 /* Send SIGTTOU for background output. */ +#define ITOSTOP TOSTOP +#define EXTPROC 0x10000 /* External processing on pty */ /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ #define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ diff --git a/arch/parisc/include/uapi/asm/termbits.h b/arch/parisc/include/uapi/asm/termbits.h index 40e920f8d683..6017ee08f099 100644 --- a/arch/parisc/include/uapi/asm/termbits.h +++ b/arch/parisc/include/uapi/asm/termbits.h @@ -61,127 +61,127 @@ struct ktermios { /* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IUCLC 0001000 -#define IXON 0002000 -#define IXANY 0004000 -#define IXOFF 0010000 -#define IMAXBEL 0040000 -#define IUTF8 0100000 +#define IGNBRK 0x00001 +#define BRKINT 0x00002 +#define IGNPAR 0x00004 +#define PARMRK 0x00008 +#define INPCK 0x00010 +#define ISTRIP 0x00020 +#define INLCR 0x00040 +#define IGNCR 0x00080 +#define ICRNL 0x00100 +#define IUCLC 0x00200 +#define IXON 0x00400 +#define IXANY 0x00800 +#define IXOFF 0x01000 +#define IMAXBEL 0x04000 +#define IUTF8 0x08000 /* c_oflag bits */ -#define OPOST 0000001 -#define OLCUC 0000002 -#define ONLCR 0000004 -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 +#define OPOST 0x00001 +#define OLCUC 0x00002 +#define ONLCR 0x00004 +#define OCRNL 0x00008 +#define ONOCR 0x00010 +#define ONLRET 0x00020 +#define OFILL 0x00040 +#define OFDEL 0x00080 +#define NLDLY 0x00100 +#define NL0 0x00000 +#define NL1 0x00100 +#define CRDLY 0x00600 +#define CR0 0x00000 +#define CR1 0x00200 +#define CR2 0x00400 +#define CR3 0x00600 +#define TABDLY 0x01800 +#define TAB0 0x00000 +#define TAB1 0x00800 +#define TAB2 0x01000 +#define TAB3 0x01800 +#define XTABS 0x01800 +#define BSDLY 0x02000 +#define BS0 0x00000 +#define BS1 0x02000 +#define VTDLY 0x04000 +#define VT0 0x00000 +#define VT1 0x04000 +#define FFDLY 0x08000 +#define FF0 0x00000 +#define FF1 0x08000 /* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 +#define CBAUD 0x0000100f +#define B0 0x00000000 /* hang up */ +#define B50 0x00000001 +#define B75 0x00000002 +#define B110 0x00000003 +#define B134 0x00000004 +#define B150 0x00000005 +#define B200 0x00000006 +#define B300 0x00000007 +#define B600 0x00000008 +#define B1200 0x00000009 +#define B1800 0x0000000a +#define B2400 0x0000000b +#define B4800 0x0000000c +#define B9600 0x0000000d +#define B19200 0x0000000e +#define B38400 0x0000000f #define EXTA B19200 #define EXTB B38400 -#define CSIZE 0000060 -#define CS5 0000000 -#define CS6 0000020 -#define CS7 0000040 -#define CS8 0000060 -#define CSTOPB 0000100 -#define CREAD 0000200 -#define PARENB 0000400 -#define PARODD 0001000 -#define HUPCL 0002000 -#define CLOCAL 0004000 -#define CBAUDEX 0010000 -#define BOTHER 0010000 -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ +#define CSIZE 0x00000030 +#define CS5 0x00000000 +#define CS6 0x00000010 +#define CS7 0x00000020 +#define CS8 0x00000030 +#define CSTOPB 0x00000040 +#define CREAD 0x00000080 +#define PARENB 0x00000100 +#define PARODD 0x00000200 +#define HUPCL 0x00000400 +#define CLOCAL 0x00000800 +#define CBAUDEX 0x00001000 +#define BOTHER 0x00001000 +#define B57600 0x00001001 +#define B115200 0x00001002 +#define B230400 0x00001003 +#define B460800 0x00001004 +#define B500000 0x00001005 +#define B576000 0x00001006 +#define B921600 0x00001007 +#define B1000000 0x00001008 +#define B1152000 0x00001009 +#define B1500000 0x0000100a +#define B2000000 0x0000100b +#define B2500000 0x0000100c +#define B3000000 0x0000100d +#define B3500000 0x0000100e +#define B4000000 0x0000100f +#define CIBAUD 0x100f0000 /* input baud rate */ +#define CMSPAR 0x40000000 /* mark or space (stick) parity */ +#define CRTSCTS 0x80000000 /* flow control */ #define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ /* c_lflag bits */ -#define ISIG 0000001 -#define ICANON 0000002 -#define XCASE 0000004 -#define ECHO 0000010 -#define ECHOE 0000020 -#define ECHOK 0000040 -#define ECHONL 0000100 -#define NOFLSH 0000200 -#define TOSTOP 0000400 -#define ECHOCTL 0001000 -#define ECHOPRT 0002000 -#define ECHOKE 0004000 -#define FLUSHO 0010000 -#define PENDIN 0040000 -#define IEXTEN 0100000 -#define EXTPROC 0200000 +#define ISIG 0x00001 +#define ICANON 0x00002 +#define XCASE 0x00004 +#define ECHO 0x00008 +#define ECHOE 0x00010 +#define ECHOK 0x00020 +#define ECHONL 0x00040 +#define NOFLSH 0x00080 +#define TOSTOP 0x00100 +#define ECHOCTL 0x00200 +#define ECHOPRT 0x00400 +#define ECHOKE 0x00800 +#define FLUSHO 0x01000 +#define PENDIN 0x04000 +#define IEXTEN 0x08000 +#define EXTPROC 0x10000 /* tcflow() and TCXONC use these */ #define TCOOFF 0 diff --git a/arch/powerpc/include/uapi/asm/termbits.h b/arch/powerpc/include/uapi/asm/termbits.h index ed18bc61f63d..e4892f2d5592 100644 --- a/arch/powerpc/include/uapi/asm/termbits.h +++ b/arch/powerpc/include/uapi/asm/termbits.h @@ -64,115 +64,115 @@ struct ktermios { #define VDISCARD 16 /* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IXON 0001000 -#define IXOFF 0002000 -#define IXANY 0004000 -#define IUCLC 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 +#define IGNBRK 0x00001 +#define BRKINT 0x00002 +#define IGNPAR 0x00004 +#define PARMRK 0x00008 +#define INPCK 0x00010 +#define ISTRIP 0x00020 +#define INLCR 0x00040 +#define IGNCR 0x00080 +#define ICRNL 0x00100 +#define IXON 0x00200 +#define IXOFF 0x00400 +#define IXANY 0x00800 +#define IUCLC 0x01000 +#define IMAXBEL 0x02000 +#define IUTF8 0x04000 /* c_oflag bits */ -#define OPOST 0000001 -#define ONLCR 0000002 -#define OLCUC 0000004 - -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 - -#define OFILL 00000100 -#define OFDEL 00000200 -#define NLDLY 00001400 -#define NL0 00000000 -#define NL1 00000400 -#define NL2 00001000 -#define NL3 00001400 -#define TABDLY 00006000 -#define TAB0 00000000 -#define TAB1 00002000 -#define TAB2 00004000 -#define TAB3 00006000 -#define XTABS 00006000 /* required by POSIX to == TAB3 */ -#define CRDLY 00030000 -#define CR0 00000000 -#define CR1 00010000 -#define CR2 00020000 -#define CR3 00030000 -#define FFDLY 00040000 -#define FF0 00000000 -#define FF1 00040000 -#define BSDLY 00100000 -#define BS0 00000000 -#define BS1 00100000 -#define VTDLY 00200000 -#define VT0 00000000 -#define VT1 00200000 +#define OPOST 0x00001 +#define ONLCR 0x00002 +#define OLCUC 0x00004 + +#define OCRNL 0x00008 +#define ONOCR 0x00010 +#define ONLRET 0x00020 + +#define OFILL 0x000040 +#define OFDEL 0x000080 +#define NLDLY 0x000300 +#define NL0 0x000000 +#define NL1 0x000100 +#define NL2 0x000200 +#define NL3 0x000300 +#define TABDLY 0x000c00 +#define TAB0 0x000000 +#define TAB1 0x000400 +#define TAB2 0x000800 +#define TAB3 0x000c00 +#define XTABS 0x000c00 /* required by POSIX to == TAB3 */ +#define CRDLY 0x003000 +#define CR0 0x000000 +#define CR1 0x001000 +#define CR2 0x002000 +#define CR3 0x003000 +#define FFDLY 0x004000 +#define FF0 0x000000 +#define FF1 0x004000 +#define BSDLY 0x008000 +#define BS0 0x000000 +#define BS1 0x008000 +#define VTDLY 0x010000 +#define VT0 0x000000 +#define VT1 0x010000 /* c_cflag bit meaning */ -#define CBAUD 0000377 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 +#define CBAUD 0x000000ff +#define B0 0x00000000 /* hang up */ +#define B50 0x00000001 +#define B75 0x00000002 +#define B110 0x00000003 +#define B134 0x00000004 +#define B150 0x00000005 +#define B200 0x00000006 +#define B300 0x00000007 +#define B600 0x00000008 +#define B1200 0x00000009 +#define B1800 0x0000000a +#define B2400 0x0000000b +#define B4800 0x0000000c +#define B9600 0x0000000d +#define B19200 0x0000000e +#define B38400 0x0000000f #define EXTA B19200 #define EXTB B38400 -#define CBAUDEX 0000000 -#define B57600 00020 -#define B115200 00021 -#define B230400 00022 -#define B460800 00023 -#define B500000 00024 -#define B576000 00025 -#define B921600 00026 -#define B1000000 00027 -#define B1152000 00030 -#define B1500000 00031 -#define B2000000 00032 -#define B2500000 00033 -#define B3000000 00034 -#define B3500000 00035 -#define B4000000 00036 -#define BOTHER 00037 - -#define CIBAUD 077600000 +#define CBAUDEX 0x00000000 +#define B57600 0x00000010 +#define B115200 0x00000011 +#define B230400 0x00000012 +#define B460800 0x00000013 +#define B500000 0x00000014 +#define B576000 0x00000015 +#define B921600 0x00000016 +#define B1000000 0x00000017 +#define B1152000 0x00000018 +#define B1500000 0x00000019 +#define B2000000 0x0000001a +#define B2500000 0x0000001b +#define B3000000 0x0000001c +#define B3500000 0x0000001d +#define B4000000 0x0000001e +#define BOTHER 0x0000001f + +#define CIBAUD 0x00ff0000 #define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ -#define CSIZE 00001400 -#define CS5 00000000 -#define CS6 00000400 -#define CS7 00001000 -#define CS8 00001400 - -#define CSTOPB 00002000 -#define CREAD 00004000 -#define PARENB 00010000 -#define PARODD 00020000 -#define HUPCL 00040000 - -#define CLOCAL 00100000 -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ +#define CSIZE 0x00000300 +#define CS5 0x00000000 +#define CS6 0x00000100 +#define CS7 0x00000200 +#define CS8 0x00000300 + +#define CSTOPB 0x00000400 +#define CREAD 0x00000800 +#define PARENB 0x00001000 +#define PARODD 0x00002000 +#define HUPCL 0x00004000 + +#define CLOCAL 0x00008000 +#define CMSPAR 0x40000000 /* mark or space (stick) parity */ +#define CRTSCTS 0x80000000 /* flow control */ /* c_lflag bits */ #define ISIG 0x00000080 diff --git a/include/uapi/asm-generic/termbits.h b/include/uapi/asm-generic/termbits.h index 2fbaf9ae89dd..470fd673ff84 100644 --- a/include/uapi/asm-generic/termbits.h +++ b/include/uapi/asm-generic/termbits.h @@ -60,126 +60,126 @@ struct ktermios { #define VEOL2 16 /* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IUCLC 0001000 -#define IXON 0002000 -#define IXANY 0004000 -#define IXOFF 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 +#define IGNBRK 0x00001 +#define BRKINT 0x00002 +#define IGNPAR 0x00004 +#define PARMRK 0x00008 +#define INPCK 0x00010 +#define ISTRIP 0x00020 +#define INLCR 0x00040 +#define IGNCR 0x00080 +#define ICRNL 0x00100 +#define IUCLC 0x00200 +#define IXON 0x00400 +#define IXANY 0x00800 +#define IXOFF 0x01000 +#define IMAXBEL 0x02000 +#define IUTF8 0x04000 /* c_oflag bits */ -#define OPOST 0000001 -#define OLCUC 0000002 -#define ONLCR 0000004 -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 +#define OPOST 0x00001 +#define OLCUC 0x00002 +#define ONLCR 0x00004 +#define OCRNL 0x00008 +#define ONOCR 0x00010 +#define ONLRET 0x00020 +#define OFILL 0x00040 +#define OFDEL 0x00080 +#define NLDLY 0x00100 +#define NL0 0x00000 +#define NL1 0x00100 +#define CRDLY 0x00600 +#define CR0 0x00000 +#define CR1 0x00200 +#define CR2 0x00400 +#define CR3 0x00600 +#define TABDLY 0x01800 +#define TAB0 0x00000 +#define TAB1 0x00800 +#define TAB2 0x01000 +#define TAB3 0x01800 +#define XTABS 0x01800 +#define BSDLY 0x02000 +#define BS0 0x00000 +#define BS1 0x02000 +#define VTDLY 0x04000 +#define VT0 0x00000 +#define VT1 0x04000 +#define FFDLY 0x08000 +#define FF0 0x00000 +#define FF1 0x08000 /* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 +#define CBAUD 0x0000100f +#define B0 0x00000000 /* hang up */ +#define B50 0x00000001 +#define B75 0x00000002 +#define B110 0x00000003 +#define B134 0x00000004 +#define B150 0x00000005 +#define B200 0x00000006 +#define B300 0x00000007 +#define B600 0x00000008 +#define B1200 0x00000009 +#define B1800 0x0000000a +#define B2400 0x0000000b +#define B4800 0x0000000c +#define B9600 0x0000000d +#define B19200 0x0000000e +#define B38400 0x0000000f #define EXTA B19200 #define EXTB B38400 -#define CSIZE 0000060 -#define CS5 0000000 -#define CS6 0000020 -#define CS7 0000040 -#define CS8 0000060 -#define CSTOPB 0000100 -#define CREAD 0000200 -#define PARENB 0000400 -#define PARODD 0001000 -#define HUPCL 0002000 -#define CLOCAL 0004000 -#define CBAUDEX 0010000 -#define BOTHER 0010000 -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ +#define CSIZE 0x00000030 +#define CS5 0x00000000 +#define CS6 0x00000010 +#define CS7 0x00000020 +#define CS8 0x00000030 +#define CSTOPB 0x00000040 +#define CREAD 0x00000080 +#define PARENB 0x00000100 +#define PARODD 0x00000200 +#define HUPCL 0x00000400 +#define CLOCAL 0x00000800 +#define CBAUDEX 0x00001000 +#define BOTHER 0x00001000 +#define B57600 0x00001001 +#define B115200 0x00001002 +#define B230400 0x00001003 +#define B460800 0x00001004 +#define B500000 0x00001005 +#define B576000 0x00001006 +#define B921600 0x00001007 +#define B1000000 0x00001008 +#define B1152000 0x00001009 +#define B1500000 0x0000100a +#define B2000000 0x0000100b +#define B2500000 0x0000100c +#define B3000000 0x0000100d +#define B3500000 0x0000100e +#define B4000000 0x0000100f +#define CIBAUD 0x100f0000 /* input baud rate */ +#define CMSPAR 0x40000000 /* mark or space (stick) parity */ +#define CRTSCTS 0x80000000 /* flow control */ #define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ /* c_lflag bits */ -#define ISIG 0000001 -#define ICANON 0000002 -#define XCASE 0000004 -#define ECHO 0000010 -#define ECHOE 0000020 -#define ECHOK 0000040 -#define ECHONL 0000100 -#define NOFLSH 0000200 -#define TOSTOP 0000400 -#define ECHOCTL 0001000 -#define ECHOPRT 0002000 -#define ECHOKE 0004000 -#define FLUSHO 0010000 -#define PENDIN 0040000 -#define IEXTEN 0100000 -#define EXTPROC 0200000 +#define ISIG 0x00001 +#define ICANON 0x00002 +#define XCASE 0x00004 +#define ECHO 0x00008 +#define ECHOE 0x00010 +#define ECHOK 0x00020 +#define ECHONL 0x00040 +#define NOFLSH 0x00080 +#define TOSTOP 0x00100 +#define ECHOCTL 0x00200 +#define ECHOPRT 0x00400 +#define ECHOKE 0x00800 +#define FLUSHO 0x01000 +#define PENDIN 0x04000 +#define IEXTEN 0x08000 +#define EXTPROC 0x10000 /* tcflow() and TCXONC use these */ #define TCOOFF 0 -- cgit v1.2.3 From e6f6390ab7b9d649c13de2c8a591bce61a10ec3b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 20:20:25 +0100 Subject: powerpc: Add missing headers Don't inherit headers "by chances" from asm/prom.h, asm/mpc52xx.h, asm/pci.h etc... Include the needed headers, and remove asm/prom.h when it was needed exclusively for pulling necessary headers. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/be8bdc934d152a7d8ee8d1a840d5596e2f7d85e0.1646767214.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/pnv-pci.h | 1 + arch/powerpc/kernel/btext.c | 2 +- arch/powerpc/kernel/cputable.c | 2 +- arch/powerpc/kernel/crash_dump.c | 2 +- arch/powerpc/kernel/dt_cpu_ftrs.c | 2 +- arch/powerpc/kernel/eeh_pe.c | 1 + arch/powerpc/kernel/eeh_sysfs.c | 1 + arch/powerpc/kernel/fadump.c | 3 ++- arch/powerpc/kernel/isa-bridge.c | 2 +- arch/powerpc/kernel/legacy_serial.c | 2 +- arch/powerpc/kernel/nvram_64.c | 2 +- arch/powerpc/kernel/pci-common.c | 2 +- arch/powerpc/kernel/pci-hotplug.c | 1 + arch/powerpc/kernel/pci_64.c | 2 +- arch/powerpc/kernel/pci_dn.c | 2 +- arch/powerpc/kernel/pci_of_scan.c | 2 +- arch/powerpc/kernel/proc_powerpc.c | 2 +- arch/powerpc/kernel/prom_init.c | 2 ++ arch/powerpc/kernel/rtas-proc.c | 2 +- arch/powerpc/kernel/rtas.c | 3 ++- arch/powerpc/kernel/rtas_pci.c | 3 ++- arch/powerpc/kernel/setup-common.c | 3 ++- arch/powerpc/kernel/setup_32.c | 3 ++- arch/powerpc/kernel/setup_64.c | 3 ++- arch/powerpc/kernel/sysfs.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/kexec/core_64.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 1 + arch/powerpc/kvm/powerpc.c | 1 + arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/mm/drmem.c | 2 +- arch/powerpc/perf/imc-pmu.c | 1 + arch/powerpc/platforms/44x/canyonlands.c | 1 + arch/powerpc/platforms/44x/fsp2.c | 2 +- arch/powerpc/platforms/44x/ppc476.c | 2 +- arch/powerpc/platforms/44x/warp.c | 3 ++- arch/powerpc/platforms/4xx/hsta_msi.c | 1 + arch/powerpc/platforms/4xx/pci.c | 1 + arch/powerpc/platforms/4xx/uic.c | 3 ++- arch/powerpc/platforms/512x/mpc5121_ads_cpld.c | 3 ++- arch/powerpc/platforms/512x/mpc512x_shared.c | 2 +- arch/powerpc/platforms/52xx/lite5200_pm.c | 2 ++ arch/powerpc/platforms/52xx/media5200.c | 3 ++- arch/powerpc/platforms/52xx/mpc5200_simple.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_common.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 2 ++ arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c | 3 ++- arch/powerpc/platforms/52xx/mpc52xx_pci.c | 1 + arch/powerpc/platforms/52xx/mpc52xx_pic.c | 3 ++- arch/powerpc/platforms/52xx/mpc52xx_pm.c | 2 ++ arch/powerpc/platforms/82xx/pq2ads-pci-pic.c | 2 +- arch/powerpc/platforms/83xx/mpc832x_rdb.c | 1 + arch/powerpc/platforms/83xx/mpc834x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc837x_mds.c | 2 +- arch/powerpc/platforms/83xx/usb.c | 2 +- arch/powerpc/platforms/85xx/ge_imp3a.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_cds.c | 3 ++- arch/powerpc/platforms/85xx/mpc85xx_ds.c | 2 +- arch/powerpc/platforms/85xx/p1022_ds.c | 1 + arch/powerpc/platforms/85xx/p1022_rdk.c | 1 + arch/powerpc/platforms/85xx/p1023_rdb.c | 2 +- arch/powerpc/platforms/85xx/qemu_e500.c | 1 + arch/powerpc/platforms/85xx/xes_mpc85xx.c | 2 +- arch/powerpc/platforms/86xx/gef_ppc9a.c | 2 +- arch/powerpc/platforms/86xx/gef_sbc310.c | 2 +- arch/powerpc/platforms/86xx/gef_sbc610.c | 2 +- arch/powerpc/platforms/86xx/mpc8610_hpcd.c | 3 ++- arch/powerpc/platforms/86xx/mvme7100.c | 1 + arch/powerpc/platforms/8xx/cpm1.c | 2 +- arch/powerpc/platforms/8xx/m8xx_setup.c | 3 ++- arch/powerpc/platforms/8xx/pic.c | 3 ++- arch/powerpc/platforms/amigaone/setup.c | 1 + arch/powerpc/platforms/cell/axon_msi.c | 2 +- arch/powerpc/platforms/cell/cbe_powerbutton.c | 2 +- arch/powerpc/platforms/cell/cbe_regs.c | 2 +- arch/powerpc/platforms/cell/interrupt.c | 3 ++- arch/powerpc/platforms/cell/iommu.c | 2 ++ arch/powerpc/platforms/cell/ras.c | 2 +- arch/powerpc/platforms/cell/spider-pci.c | 1 + arch/powerpc/platforms/cell/spider-pic.c | 3 ++- arch/powerpc/platforms/cell/spu_manage.c | 3 ++- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/powerpc/platforms/chrp/nvram.c | 2 +- arch/powerpc/platforms/chrp/pci.c | 2 +- arch/powerpc/platforms/chrp/setup.c | 4 +++- arch/powerpc/platforms/chrp/time.c | 2 +- arch/powerpc/platforms/embedded6xx/holly.c | 3 ++- arch/powerpc/platforms/embedded6xx/ls_uart.c | 2 +- arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c | 2 +- arch/powerpc/platforms/embedded6xx/mvme5100.c | 2 +- arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c | 3 ++- arch/powerpc/platforms/embedded6xx/wii.c | 2 +- arch/powerpc/platforms/fsl_uli1575.c | 1 + arch/powerpc/platforms/maple/pci.c | 2 +- arch/powerpc/platforms/maple/setup.c | 2 +- arch/powerpc/platforms/maple/time.c | 2 +- arch/powerpc/platforms/pasemi/dma_lib.c | 2 ++ arch/powerpc/platforms/pasemi/iommu.c | 1 + arch/powerpc/platforms/pasemi/misc.c | 1 + arch/powerpc/platforms/pasemi/msi.c | 2 +- arch/powerpc/platforms/pasemi/pci.c | 1 + arch/powerpc/platforms/pasemi/setup.c | 2 +- arch/powerpc/platforms/powermac/bootx_init.c | 1 + arch/powerpc/platforms/powermac/low_i2c.c | 2 +- arch/powerpc/platforms/powermac/nvram.c | 2 +- arch/powerpc/platforms/powermac/pci.c | 3 ++- arch/powerpc/platforms/powermac/pfunc_core.c | 2 +- arch/powerpc/platforms/powermac/pic.c | 4 +++- arch/powerpc/platforms/powermac/smp.c | 2 +- arch/powerpc/platforms/powermac/time.c | 2 +- arch/powerpc/platforms/powermac/udbg_adb.c | 1 + arch/powerpc/platforms/powermac/udbg_scc.c | 1 + arch/powerpc/platforms/powernv/eeh-powernv.c | 1 + arch/powerpc/platforms/powernv/pci-cxl.c | 1 + arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++- arch/powerpc/platforms/ps3/setup.c | 2 +- arch/powerpc/platforms/pseries/msi.c | 1 + arch/powerpc/platforms/pseries/nvram.c | 2 +- arch/powerpc/platforms/pseries/rtas-fadump.c | 3 ++- arch/powerpc/platforms/pseries/setup.c | 2 +- arch/powerpc/platforms/pseries/vio.c | 1 + arch/powerpc/sysdev/cpm2_pic.c | 2 +- arch/powerpc/sysdev/dart_iommu.c | 2 +- arch/powerpc/sysdev/dcr.c | 2 +- arch/powerpc/sysdev/fsl_lbc.c | 3 ++- arch/powerpc/sysdev/fsl_msi.c | 4 +++- arch/powerpc/sysdev/fsl_pci.c | 3 ++- arch/powerpc/sysdev/ge/ge_pic.c | 4 +++- arch/powerpc/sysdev/grackle.c | 2 +- arch/powerpc/sysdev/i8259.c | 2 +- arch/powerpc/sysdev/ipic.c | 3 ++- arch/powerpc/sysdev/mmio_nvram.c | 2 +- arch/powerpc/sysdev/mpic.c | 2 ++ arch/powerpc/sysdev/mpic_msgr.c | 3 ++- arch/powerpc/sysdev/mpic_msi.c | 3 ++- arch/powerpc/sysdev/mpic_u3msi.c | 2 +- arch/powerpc/sysdev/msi_bitmap.c | 1 + arch/powerpc/sysdev/pmi.c | 3 ++- arch/powerpc/sysdev/rtc_cmos_setup.c | 2 +- arch/powerpc/sysdev/tsi108_dev.c | 3 ++- arch/powerpc/sysdev/tsi108_pci.c | 3 ++- arch/powerpc/sysdev/xics/icp-native.c | 3 ++- arch/powerpc/sysdev/xics/xics-common.c | 2 +- arch/powerpc/sysdev/xive/common.c | 2 +- arch/powerpc/sysdev/xive/native.c | 2 +- arch/powerpc/sysdev/xive/spapr.c | 2 ++ 146 files changed, 195 insertions(+), 108 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index b3f480799352..8afc92860dbb 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 5ff1b5e805f6..8f69bb07e500 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -10,9 +10,9 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 2a271a6d6924..7bd6546b9fde 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -12,9 +12,9 @@ #include #include #include +#include #include -#include /* for PTRRELOC on ARCH=ppc */ #include #include #include diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index 5693e1c67c2b..c438c60fbc54 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -12,9 +12,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 7d1b2c4a4891..c8e147b66d34 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,7 +20,6 @@ #include #include #include -#include #include diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 845e024321d4..d7a9cf376831 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 429620da73ba..706e1eb95efe 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -6,6 +6,7 @@ * * Send comments and feedback to Linas Vepstas */ +#include #include #include #include diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a4f5bbda9715..bfb671fe904b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -25,9 +25,10 @@ #include #include #include +#include +#include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 39c625737c09..dc746611ebc0 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -18,11 +18,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index cfc03e016ff2..5c58460b269a 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -7,10 +7,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 0d9f9cd41e13..e385d3164648 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -19,9 +19,9 @@ #include #include #include +#include #include #include -#include #include #undef DEBUG_NVRAM diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 9a97a93bd11c..63ed90ba9f0b 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -30,10 +30,10 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 2fc12198ec07..0fe251c6ac2c 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 81f84e71ab93..19b03ddf5631 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -19,10 +19,10 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 61571ae23953..938ab8838ab5 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -12,9 +12,9 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 6f2b0cc1ddd6..756043dd06e9 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -13,8 +13,8 @@ #include #include +#include #include -#include /** * get_int_prop - Decode a u32 from a device tree property diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index 6a029f2378e1..b109cd7b5d01 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -7,12 +7,12 @@ #include #include #include +#include #include #include #include #include -#include #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ace861ec4c4c..04694ec423f6 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index 117886782ebd..081b2b741a8c 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -24,11 +24,11 @@ #include #include #include +#include #include #include #include -#include #include #include /* for ppc_md */ #include diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 1f42aabbbab3..a6bbce998ddf 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -24,9 +24,10 @@ #include #include #include +#include +#include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 781c1869902e..5a2f5ea3b054 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -14,10 +14,11 @@ #include #include #include +#include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 3acf2782acdf..baafad08fb20 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -30,12 +30,13 @@ #include #include #include +#include +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index a6e9d36d7c01..813261789303 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -20,9 +20,10 @@ #include #include #include +#include +#include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a96f05063bc9..0e8fc1cd1c55 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -31,11 +31,12 @@ #include #include #include +#include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 2069bbb90a9a..3a10cda9c05e 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -9,12 +9,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e334de41b408..587adcc12860 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -64,7 +65,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index c29c639551fe..c2bea9db1c1e 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,6 @@ #include #include #include /* _end */ -#include #include #include #include diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6fa518f6501d..5ea107d830a6 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 533c4232e5ab..81069e3c3570 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 20662d378fd4..fc92613dc2bf 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 22197b18d85e..2369d1bf2411 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include static int n_root_addr_cells, n_root_size_cells; diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 498f1a2f7658..d7976ab40d38 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -6,6 +6,7 @@ * (C) 2017 Anju T Sudhakar, IBM Corporation. * (C) 2017 Hemant K Shaw, IBM Corporation. */ +#include #include #include #include diff --git a/arch/powerpc/platforms/44x/canyonlands.c b/arch/powerpc/platforms/44x/canyonlands.c index 807968a755ef..5b23aef8bdef 100644 --- a/arch/powerpc/platforms/44x/canyonlands.c +++ b/arch/powerpc/platforms/44x/canyonlands.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "44x.h" diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c index af13a59d2f60..e2e4f6d8150d 100644 --- a/arch/powerpc/platforms/44x/fsp2.c +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -14,11 +14,11 @@ */ #include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/44x/ppc476.c b/arch/powerpc/platforms/44x/ppc476.c index fb7db5cedd4e..20cc8f80b086 100644 --- a/arch/powerpc/platforms/44x/ppc476.c +++ b/arch/powerpc/platforms/44x/ppc476.c @@ -19,11 +19,11 @@ #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index 665f18e37efb..f03432ef010b 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -11,12 +11,13 @@ #include #include #include +#include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/4xx/hsta_msi.c b/arch/powerpc/platforms/4xx/hsta_msi.c index fee430eadcc6..d4f7fff1fc87 100644 --- a/arch/powerpc/platforms/4xx/hsta_msi.c +++ b/arch/powerpc/platforms/4xx/hsta_msi.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/4xx/pci.c b/arch/powerpc/platforms/4xx/pci.c index 24f41e178cbc..ca5dd7a5842a 100644 --- a/arch/powerpc/platforms/4xx/pci.c +++ b/arch/powerpc/platforms/4xx/pci.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/platforms/4xx/uic.c b/arch/powerpc/platforms/4xx/uic.c index 89e2587b1a59..d667ad039bd3 100644 --- a/arch/powerpc/platforms/4xx/uic.c +++ b/arch/powerpc/platforms/4xx/uic.c @@ -19,9 +19,10 @@ #include #include #include +#include +#include #include #include -#include #include #define NR_UIC_INTS 32 diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c index ea46870e5d6e..6f08d07aee3b 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include static struct device_node *cpld_pic_node; static struct irq_domain *cpld_pic_host; diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index 96d9cf49560d..5ac0ead2540f 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -20,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/52xx/lite5200_pm.c b/arch/powerpc/platforms/52xx/lite5200_pm.c index e7da22d1df87..129313b1d021 100644 --- a/arch/powerpc/platforms/52xx/lite5200_pm.c +++ b/arch/powerpc/platforms/52xx/lite5200_pm.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include + #include #include #include diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index 110c444f4bc7..ee367ff3ec8a 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -20,8 +20,9 @@ #include #include #include +#include +#include #include -#include #include #include diff --git a/arch/powerpc/platforms/52xx/mpc5200_simple.c b/arch/powerpc/platforms/52xx/mpc5200_simple.c index b9f5675b0a1d..cc349d579061 100644 --- a/arch/powerpc/platforms/52xx/mpc5200_simple.c +++ b/arch/powerpc/platforms/52xx/mpc5200_simple.c @@ -22,8 +22,8 @@ */ #undef DEBUG +#include #include -#include #include #include diff --git a/arch/powerpc/platforms/52xx/mpc52xx_common.c b/arch/powerpc/platforms/52xx/mpc52xx_common.c index 60aa6015e284..4348506d667d 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_common.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_common.c @@ -15,11 +15,11 @@ #include #include #include +#include #include #include #include #include -#include #include /* MPC5200 device tree match tables */ diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 03e9bf7c0773..60691b9a248c 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c index 54dfc63809d3..48038aaedbd3 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c @@ -11,11 +11,12 @@ #include #include #include +#include +#include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pci.c b/arch/powerpc/platforms/52xx/mpc52xx_pci.c index 390a244dd28e..859e2818c43d 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pci.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pci.c @@ -13,6 +13,7 @@ #undef DEBUG #include +#include #include #include #include diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c index 76a8102bdb98..1e0a5e9644dc 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c @@ -101,8 +101,9 @@ #include #include #include +#include +#include #include -#include #include /* HW IRQ mapping */ diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pm.c b/arch/powerpc/platforms/52xx/mpc52xx_pm.c index b1d208ded981..549b3629e39a 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pm.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pm.c @@ -2,6 +2,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c index 285bfe19b798..cf3210042a2e 100644 --- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c +++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c @@ -14,9 +14,9 @@ #include #include #include +#include #include -#include #include #include "pq2.h" diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index b6133a237a70..bb8caa5071f8 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc834x_mds.c b/arch/powerpc/platforms/83xx/mpc834x_mds.c index 0713deffb40c..7dde5a75332b 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc834x_mds.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc837x_mds.c b/arch/powerpc/platforms/83xx/mpc837x_mds.c index fc88ab97f6e3..fa3538803af7 100644 --- a/arch/powerpc/platforms/83xx/mpc837x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc837x_mds.c @@ -9,12 +9,12 @@ #include #include +#include #include #include #include #include -#include #include #include "mpc83xx.h" diff --git a/arch/powerpc/platforms/83xx/usb.c b/arch/powerpc/platforms/83xx/usb.c index b0bda20aaccf..e2a13a052f96 100644 --- a/arch/powerpc/platforms/83xx/usb.c +++ b/arch/powerpc/platforms/83xx/usb.c @@ -11,9 +11,9 @@ #include #include #include +#include #include -#include #include #include "mpc83xx.h" diff --git a/arch/powerpc/platforms/85xx/ge_imp3a.c b/arch/powerpc/platforms/85xx/ge_imp3a.c index 743c65e4d8e4..8e827376d97b 100644 --- a/arch/powerpc/platforms/85xx/ge_imp3a.c +++ b/arch/powerpc/platforms/85xx/ge_imp3a.c @@ -17,13 +17,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c index ad0b88ecfd0c..48f3acfece0b 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -33,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index 2157a8017aa4..f8d2c97f39bd 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -15,13 +15,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c index 1f1af0557470..537599906146 100644 --- a/arch/powerpc/platforms/85xx/p1022_ds.c +++ b/arch/powerpc/platforms/85xx/p1022_ds.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/p1022_rdk.c b/arch/powerpc/platforms/85xx/p1022_rdk.c index fd9e3e7ef234..bc58a99164c9 100644 --- a/arch/powerpc/platforms/85xx/p1022_rdk.c +++ b/arch/powerpc/platforms/85xx/p1022_rdk.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/p1023_rdb.c b/arch/powerpc/platforms/85xx/p1023_rdb.c index 3b9cc4979641..c04868eb2eb1 100644 --- a/arch/powerpc/platforms/85xx/p1023_rdb.c +++ b/arch/powerpc/platforms/85xx/p1023_rdb.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include #include #include #include "smp.h" diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c b/arch/powerpc/platforms/85xx/qemu_e500.c index 4c4d577effd9..64109ad6736c 100644 --- a/arch/powerpc/platforms/85xx/qemu_e500.c +++ b/arch/powerpc/platforms/85xx/qemu_e500.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/xes_mpc85xx.c b/arch/powerpc/platforms/85xx/xes_mpc85xx.c index 397e158c1edb..5836e4ecb7a0 100644 --- a/arch/powerpc/platforms/85xx/xes_mpc85xx.c +++ b/arch/powerpc/platforms/85xx/xes_mpc85xx.c @@ -16,13 +16,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/86xx/gef_ppc9a.c b/arch/powerpc/platforms/86xx/gef_ppc9a.c index 884da08806ce..8e358fa0bc41 100644 --- a/arch/powerpc/platforms/86xx/gef_ppc9a.c +++ b/arch/powerpc/platforms/86xx/gef_ppc9a.c @@ -18,12 +18,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/86xx/gef_sbc310.c b/arch/powerpc/platforms/86xx/gef_sbc310.c index baaf1ab07016..b5b2733567cb 100644 --- a/arch/powerpc/platforms/86xx/gef_sbc310.c +++ b/arch/powerpc/platforms/86xx/gef_sbc310.c @@ -18,12 +18,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/86xx/gef_sbc610.c b/arch/powerpc/platforms/86xx/gef_sbc610.c index 120caf6af71d..bb4c8e6b44d0 100644 --- a/arch/powerpc/platforms/86xx/gef_sbc610.c +++ b/arch/powerpc/platforms/86xx/gef_sbc610.c @@ -18,12 +18,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/86xx/mpc8610_hpcd.c b/arch/powerpc/platforms/86xx/mpc8610_hpcd.c index 7733d0607da2..b593b9afd30a 100644 --- a/arch/powerpc/platforms/86xx/mpc8610_hpcd.c +++ b/arch/powerpc/platforms/86xx/mpc8610_hpcd.c @@ -20,12 +20,13 @@ #include #include #include +#include +#include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/86xx/mvme7100.c b/arch/powerpc/platforms/86xx/mvme7100.c index ee983613570c..b2cc32a32d0b 100644 --- a/arch/powerpc/platforms/86xx/mvme7100.c +++ b/arch/powerpc/platforms/86xx/mvme7100.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c index 3ef5e9fd3a9b..a3857df16472 100644 --- a/arch/powerpc/platforms/8xx/cpm1.c +++ b/arch/powerpc/platforms/8xx/cpm1.c @@ -33,12 +33,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index df4d57d07f9a..c6004a0210a0 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -17,10 +17,11 @@ #include #include #include +#include +#include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/8xx/pic.c b/arch/powerpc/platforms/8xx/pic.c index 04a6abf14c29..1fe178cd6810 100644 --- a/arch/powerpc/platforms/8xx/pic.c +++ b/arch/powerpc/platforms/8xx/pic.c @@ -4,7 +4,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index 9d252c554f7f..397ce6a40bd0 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -8,6 +8,7 @@ * Copyright 2003 by Hans-Joerg Frieden and Thomas Frieden */ +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 354a58c1e6f2..f3291e957a19 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -13,10 +13,10 @@ #include #include #include +#include #include #include -#include #include "cell.h" diff --git a/arch/powerpc/platforms/cell/cbe_powerbutton.c b/arch/powerpc/platforms/cell/cbe_powerbutton.c index bda589dfb051..a3ee397486f6 100644 --- a/arch/powerpc/platforms/cell/cbe_powerbutton.c +++ b/arch/powerpc/platforms/cell/cbe_powerbutton.c @@ -9,9 +9,9 @@ #include #include +#include #include #include -#include static struct input_dev *button_dev; static struct platform_device *button_pdev; diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c index 03512a41bd7e..316e533afc00 100644 --- a/arch/powerpc/platforms/cell/cbe_regs.c +++ b/arch/powerpc/platforms/cell/cbe_regs.c @@ -10,12 +10,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index 0873a7a20271..03ee8152ee97 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -18,15 +18,16 @@ #include #include +#include #include #include #include #include #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 3f141cf5e580..0ca3efeef293 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -12,8 +12,10 @@ #include #include #include +#include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 4325c05bedd9..8d934ea6270c 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -12,11 +12,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spider-pci.c b/arch/powerpc/platforms/cell/spider-pci.c index 3a2ea8376e32..e36ebd84f55b 100644 --- a/arch/powerpc/platforms/cell/spider-pci.c +++ b/arch/powerpc/platforms/cell/spider-pci.c @@ -8,6 +8,7 @@ #undef DEBUG #include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 8af75867cb42..11df737c8c6a 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -10,9 +10,10 @@ #include #include #include +#include +#include #include -#include #include #include "interrupt.h" diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index 080ed2d2c682..ae09c5a91b40 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -16,11 +16,12 @@ #include #include #include +#include +#include #include #include #include -#include #include "spufs/spufs.h" #include "interrupt.h" diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 4c702192412f..34334c32b7f5 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -21,10 +21,10 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/chrp/nvram.c b/arch/powerpc/platforms/chrp/nvram.c index e820332b59a0..dab78076fedb 100644 --- a/arch/powerpc/platforms/chrp/nvram.c +++ b/arch/powerpc/platforms/chrp/nvram.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include "chrp.h" diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c index 76e6256cb0a7..6f6598e771ff 100644 --- a/arch/powerpc/platforms/chrp/pci.c +++ b/arch/powerpc/platforms/chrp/pci.c @@ -9,11 +9,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index 3cfc382841e5..ef4c2b15f9dd 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -32,9 +32,11 @@ #include #include #include +#include +#include +#include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/chrp/time.c b/arch/powerpc/platforms/chrp/time.c index b94dfd5090d8..d46417e3d8e0 100644 --- a/arch/powerpc/platforms/chrp/time.c +++ b/arch/powerpc/platforms/chrp/time.c @@ -21,10 +21,10 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 07e71ba3e846..78f2378d9223 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -22,12 +22,13 @@ #include #include #include +#include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/embedded6xx/ls_uart.c b/arch/powerpc/platforms/embedded6xx/ls_uart.c index 9d891bd5df5a..0133e175a0fc 100644 --- a/arch/powerpc/platforms/embedded6xx/ls_uart.c +++ b/arch/powerpc/platforms/embedded6xx/ls_uart.c @@ -14,8 +14,8 @@ #include #include #include +#include #include -#include #include #include "mpc10x.h" diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c index 9eb9abb5bce2..8b2b42210356 100644 --- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c +++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c @@ -27,10 +27,10 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/embedded6xx/mvme5100.c b/arch/powerpc/platforms/embedded6xx/mvme5100.c index c06a0490d157..4854cc592cec 100644 --- a/arch/powerpc/platforms/embedded6xx/mvme5100.c +++ b/arch/powerpc/platforms/embedded6xx/mvme5100.c @@ -12,12 +12,12 @@ * Author: Stephen Chivers */ +#include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c b/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c index 5aea46566233..e02bdabf358c 100644 --- a/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c +++ b/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c @@ -7,10 +7,11 @@ * Copyright (C) 2008,2009 Albert Herranz */ +#include + #include #include -#include #include #include diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index f60ade584bb2..9e03ff8f631c 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -13,13 +13,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/fsl_uli1575.c b/arch/powerpc/platforms/fsl_uli1575.c index 044a20c1fbde..84afae7a2561 100644 --- a/arch/powerpc/platforms/fsl_uli1575.c +++ b/arch/powerpc/platforms/fsl_uli1575.c @@ -10,6 +10,7 @@ #include #include #include +#include #include diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index 37875e478b3a..b911b31717cc 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -12,10 +12,10 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c index 4e9ad5bf3efb..c26c379e1cc8 100644 --- a/arch/powerpc/platforms/maple/setup.c +++ b/arch/powerpc/platforms/maple/setup.c @@ -36,12 +36,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/maple/time.c b/arch/powerpc/platforms/maple/time.c index 78209bb7629c..823e219ef8ee 100644 --- a/arch/powerpc/platforms/maple/time.c +++ b/arch/powerpc/platforms/maple/time.c @@ -19,9 +19,9 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index 26427311fc72..1be1f18f6f09 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index 5be7242fbd86..0a38663d44ed 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/pasemi/misc.c b/arch/powerpc/platforms/pasemi/misc.c index 1bf65d02d3ba..f859ada29074 100644 --- a/arch/powerpc/platforms/pasemi/misc.c +++ b/arch/powerpc/platforms/pasemi/misc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #ifdef CONFIG_I2C_BOARDINFO diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c index ea1e41451408..dc1846660005 100644 --- a/arch/powerpc/platforms/pasemi/msi.c +++ b/arch/powerpc/platforms/pasemi/msi.c @@ -9,9 +9,9 @@ */ #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/pasemi/pci.c b/arch/powerpc/platforms/pasemi/pci.c index d4b922759d6e..55f0160910bf 100644 --- a/arch/powerpc/platforms/pasemi/pci.c +++ b/arch/powerpc/platforms/pasemi/pci.c @@ -12,6 +12,7 @@ #include +#include #include #include diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c index f974bfe7fde1..2aef49e04dd4 100644 --- a/arch/powerpc/platforms/pasemi/setup.c +++ b/arch/powerpc/platforms/pasemi/setup.c @@ -18,8 +18,8 @@ #include #include #include +#include -#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/bootx_init.c b/arch/powerpc/platforms/powermac/bootx_init.c index 741aa5b89e55..72eb99aba40f 100644 --- a/arch/powerpc/platforms/powermac/bootx_init.c +++ b/arch/powerpc/platforms/powermac/bootx_init.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 9aded0188ce8..c1c430c66dc9 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -40,10 +40,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index 1325db55d890..fe2e0249cbc2 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -17,9 +17,9 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c index e9abe0f2e7f0..d71359b5331c 100644 --- a/arch/powerpc/platforms/powermac/pci.c +++ b/arch/powerpc/platforms/powermac/pci.c @@ -12,11 +12,12 @@ #include #include #include +#include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c index dabd4b3d797e..22741ddfd5b2 100644 --- a/arch/powerpc/platforms/powermac/pfunc_core.c +++ b/arch/powerpc/platforms/powermac/pfunc_core.c @@ -12,8 +12,8 @@ #include #include #include +#include -#include #include /* Debug */ diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index bb0566633af5..71a3f62d9b5f 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -20,11 +20,13 @@ #include #include #include +#include +#include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index da1efdc30d6c..0798ecea2474 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c index 31d6213a6c8f..4c5790aff1b5 100644 --- a/arch/powerpc/platforms/powermac/time.c +++ b/arch/powerpc/platforms/powermac/time.c @@ -24,9 +24,9 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/udbg_adb.c b/arch/powerpc/platforms/powermac/udbg_adb.c index 93e0a8372fbc..b4756defd596 100644 --- a/arch/powerpc/platforms/powermac/udbg_adb.c +++ b/arch/powerpc/platforms/powermac/udbg_adb.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/udbg_scc.c b/arch/powerpc/platforms/powermac/udbg_scc.c index 2465bf466380..734df5a32f99 100644 --- a/arch/powerpc/platforms/powermac/udbg_scc.c +++ b/arch/powerpc/platforms/powermac/udbg_scc.c @@ -5,6 +5,7 @@ * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp */ #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 33f7b959c810..7c913187ab4a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c index 53172862d23b..7e419de71db8 100644 --- a/arch/powerpc/platforms/powernv/pci-cxl.c +++ b/arch/powerpc/platforms/powernv/pci-cxl.c @@ -4,6 +4,7 @@ */ #include +#include #include #include diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b722ac902269..6fbf265a0818 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -21,10 +21,11 @@ #include #include #include +#include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 3de9145c20bc..d7495785fe47 100644 --- a/arch/powerpc/platforms/ps3/setup.c +++ b/arch/powerpc/platforms/ps3/setup.c @@ -13,13 +13,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index fb2919fd6bc0..a3a71d37cb9a 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 69db2eca367f..cbf1720eb4aa 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -13,9 +13,9 @@ #include #include #include +#include #include #include -#include #include /* Max bytes to read/write in one go */ diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 5fb26a18c398..b5853e9fcc3c 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -13,9 +13,10 @@ #include #include #include +#include +#include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 36433bb1560d..a3dab15b0a2f 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +44,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index c9f9be4ea26a..00ecac2c205b 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c index 9e86074719a9..cb9ba4ef557a 100644 --- a/arch/powerpc/sysdev/cpm2_pic.c +++ b/arch/powerpc/sysdev/cpm2_pic.c @@ -30,11 +30,11 @@ #include #include #include +#include #include #include #include -#include #include #include "cpm2_pic.h" diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index be6b99b1b352..610ca7074066 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/dcr.c b/arch/powerpc/sysdev/dcr.c index 22991e1128e3..3093f14111e6 100644 --- a/arch/powerpc/sysdev/dcr.c +++ b/arch/powerpc/sysdev/dcr.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #ifdef CONFIG_PPC_DCR_MMIO diff --git a/arch/powerpc/sysdev/fsl_lbc.c b/arch/powerpc/sysdev/fsl_lbc.c index 18acfb4e82af..217cea150987 100644 --- a/arch/powerpc/sysdev/fsl_lbc.c +++ b/arch/powerpc/sysdev/fsl_lbc.c @@ -18,13 +18,14 @@ #include #include #include +#include +#include #include #include #include #include #include #include -#include #include static DEFINE_SPINLOCK(fsl_lbc_lock); diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index b3475ae9f236..ef9a5999fa93 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -11,11 +11,13 @@ #include #include #include +#include +#include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index e5e29033929a..1011cfea2e32 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -29,7 +31,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/ge/ge_pic.c b/arch/powerpc/sysdev/ge/ge_pic.c index 413b375c4d28..a6c424680c37 100644 --- a/arch/powerpc/sysdev/ge/ge_pic.c +++ b/arch/powerpc/sysdev/ge/ge_pic.c @@ -14,12 +14,14 @@ #include #include #include +#include #include +#include +#include #include #include #include -#include #include #include "ge_pic.h" diff --git a/arch/powerpc/sysdev/grackle.c b/arch/powerpc/sysdev/grackle.c index aaba0b809032..fd2f94a884f0 100644 --- a/arch/powerpc/sysdev/grackle.c +++ b/arch/powerpc/sysdev/grackle.c @@ -9,9 +9,9 @@ #include #include #include +#include #include -#include #include #include diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 3b1ae98e3ce9..06e391485da7 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -6,11 +6,11 @@ #include #include +#include #include #include #include #include -#include static volatile void __iomem *pci_intack; /* RO, gives us the irq vector */ diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 3f10c9fc3b68..5f69e2d50f26 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -18,9 +18,10 @@ #include #include #include +#include +#include #include #include -#include #include #include "ipic.h" diff --git a/arch/powerpc/sysdev/mmio_nvram.c b/arch/powerpc/sysdev/mmio_nvram.c index 628f9b759c84..eb48210ef98e 100644 --- a/arch/powerpc/sysdev/mmio_nvram.c +++ b/arch/powerpc/sysdev/mmio_nvram.c @@ -10,12 +10,12 @@ #include #include #include +#include #include #include #include #include -#include static void __iomem *mmio_nvram_start; static long mmio_nvram_len; diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index dbcbaa4c0663..9a9381f102d6 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c index a25413826b63..698fefaaa6dd 100644 --- a/arch/powerpc/sysdev/mpic_msgr.c +++ b/arch/powerpc/sysdev/mpic_msgr.c @@ -7,12 +7,13 @@ */ #include +#include +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/mpic_msi.c b/arch/powerpc/sysdev/mpic_msi.c index 9936c014ac7d..34246c8e01c2 100644 --- a/arch/powerpc/sysdev/mpic_msi.c +++ b/arch/powerpc/sysdev/mpic_msi.c @@ -4,10 +4,11 @@ */ #include +#include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c index 73d129594078..1d8cfdfdf115 100644 --- a/arch/powerpc/sysdev/mpic_u3msi.c +++ b/arch/powerpc/sysdev/mpic_u3msi.c @@ -5,9 +5,9 @@ */ #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index fdd3e17150fc..0b6e37f3ffb8 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/sysdev/pmi.c b/arch/powerpc/sysdev/pmi.c index 9c8744e09a9c..9dabb50c36eb 100644 --- a/arch/powerpc/sysdev/pmi.c +++ b/arch/powerpc/sysdev/pmi.c @@ -17,12 +17,13 @@ #include #include #include +#include #include +#include #include #include #include -#include struct pmi_data { struct list_head handler; diff --git a/arch/powerpc/sysdev/rtc_cmos_setup.c b/arch/powerpc/sysdev/rtc_cmos_setup.c index af0f9beddca9..47cc87bd6a33 100644 --- a/arch/powerpc/sysdev/rtc_cmos_setup.c +++ b/arch/powerpc/sysdev/rtc_cmos_setup.c @@ -14,8 +14,8 @@ #include #include #include +#include -#include static int __init add_rtc(void) { diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index 9e13fb35ed5c..30051397292f 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -16,13 +16,14 @@ #include #include #include +#include +#include #include #include #include #include #include -#include #include #undef DEBUG diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index 1070220f15d5..5af4c35ff584 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #include @@ -23,7 +25,6 @@ #include #include #include -#include #undef DEBUG #ifdef DEBUG diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 7d13d2ef5a90..edc17b6b1cc2 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -6,15 +6,16 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index f3fb2a12124c..ce76f9759952 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index bb5bda6b2357..e06d015ad1a0 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 45f72fc715fc..d25d8c692909 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index 29456c255f9f..1669ddba7884 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include -- cgit v1.2.3 From a1ae431705410fc7092790977bffd1b00c63c229 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 9 Mar 2022 08:56:14 +0100 Subject: powerpc: Use rol32() instead of opencoding in csum_fold() rol32(x, 16) will do the rotate using rlwinm. No need to open code using inline assembly. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/794337eff7bb803d2c4e67d9eee635390c4c48fe.1646812553.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/checksum.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h index 8321f6053a67..4b573a3b7e17 100644 --- a/arch/powerpc/include/asm/checksum.h +++ b/arch/powerpc/include/asm/checksum.h @@ -38,14 +38,15 @@ extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, */ static inline __sum16 csum_fold(__wsum sum) { - unsigned int tmp; - - /* swap the two 16-bit halves of sum */ - __asm__("rlwinm %0,%1,16,0,31" : "=r" (tmp) : "r" (sum)); - /* if there is a carry from adding the two 16-bit halves, - it will carry from the lower half into the upper half, - giving us the correct sum in the upper half. */ - return (__force __sum16)(~((__force u32)sum + tmp) >> 16); + u32 tmp = (__force u32)sum; + + /* + * swap the two 16-bit halves of sum + * if there is a carry from adding the two 16-bit halves, + * it will carry from the lower half into the upper half, + * giving us the correct sum in the upper half. + */ + return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16); } static inline u32 from64to32(u64 x) -- cgit v1.2.3 From 9290c379d19774d8de6e2b895d756004dbad9ce5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 21 Mar 2022 16:44:18 +0100 Subject: powerpc/8xx: Simplify flush_tlb_kernel_range() In the same spirit as commit 63f501e07a85 ("powerpc/8xx: Simplify TLB handling"), simplify flush_tlb_kernel_range() for 8xx. 8xx cannot be SMP, and has 'tlbie' and 'tlbia' instructions, so an inline version of flush_tlb_kernel_range() for 8xx is worth it. With this page, first leg of change_page_attr() is: 2c: 55 29 00 3c rlwinm r9,r9,0,0,30 30: 91 23 00 00 stw r9,0(r3) 34: 7c 00 22 64 tlbie r4,r0 38: 7c 00 04 ac hwsync 3c: 38 60 00 00 li r3,0 40: 4e 80 00 20 blr Before the patch it was: 30: 55 29 00 3c rlwinm r9,r9,0,0,30 34: 91 2a 00 00 stw r9,0(r10) 38: 94 21 ff f0 stwu r1,-16(r1) 3c: 7c 08 02 a6 mflr r0 40: 38 83 10 00 addi r4,r3,4096 44: 90 01 00 14 stw r0,20(r1) 48: 48 00 00 01 bl 48 48: R_PPC_REL24 flush_tlb_kernel_range 4c: 80 01 00 14 lwz r0,20(r1) 50: 38 60 00 00 li r3,0 54: 7c 08 03 a6 mtlr r0 58: 38 21 00 10 addi r1,r1,16 5c: 4e 80 00 20 blr Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d2610043419ce3e0e53a85386baf2c3625af5cfb.1647877442.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/tlbflush.h | 12 +++++++++++- arch/powerpc/mm/nohash/tlb.c | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/tlbflush.h b/arch/powerpc/include/asm/nohash/tlbflush.h index c08d25e3e626..698935d4f72d 100644 --- a/arch/powerpc/include/asm/nohash/tlbflush.h +++ b/arch/powerpc/include/asm/nohash/tlbflush.h @@ -30,7 +30,6 @@ struct mm_struct; extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); #ifdef CONFIG_PPC_8xx static inline void local_flush_tlb_mm(struct mm_struct *mm) @@ -45,7 +44,18 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, unsigned lon { asm volatile ("tlbie %0; sync" : : "r" (vmaddr) : "memory"); } + +static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + start &= PAGE_MASK; + + if (end - start <= PAGE_SIZE) + asm volatile ("tlbie %0; sync" : : "r" (start) : "memory"); + else + asm volatile ("sync; tlbia; isync" : : : "memory"); +} #else +extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void local_flush_tlb_mm(struct mm_struct *mm); extern void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 7e1e7c3dc66a..5e7ccb48b79c 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -358,6 +358,7 @@ void __init early_init_mmu_47x(void) /* * Flush kernel TLB entries in the given range */ +#ifndef CONFIG_PPC_8xx void flush_tlb_kernel_range(unsigned long start, unsigned long end) { #ifdef CONFIG_SMP @@ -370,6 +371,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) #endif } EXPORT_SYMBOL(flush_tlb_kernel_range); +#endif /* * Currently, for range flushing, we just do a full mm flush. This should -- cgit v1.2.3 From 03ac1b71fca171315dbbc3f9318e3cd2a210541e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: powerpc/pgtable: remove _PAGE_BIT_SWAP_TYPE for book3s The swap type is simply stored in bits 0x1f of the swap pte. Let's simplify by just getting rid of _PAGE_BIT_SWAP_TYPE. It's not like that we can simply change it: _PAGE_SWP_SOFT_DIRTY would suddenly fall into _RPAGE_RSV1, which isn't possible and would make the BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY) angry. While at it, make it clearer which bit we're actually using for _PAGE_SWP_SOFT_DIRTY by just using the proper define and introduce and use SWP_TYPE_MASK. Link: https://lkml.kernel.org/r/20220329164329.208407-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 875730d5af40..8e98375d5c4a 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -13,7 +13,6 @@ /* * Common bits between hash and Radix page table */ -#define _PAGE_BIT_SWAP_TYPE 0 #define _PAGE_EXEC 0x00001 /* execute permission */ #define _PAGE_WRITE 0x00002 /* write access allowed */ @@ -751,17 +750,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * Don't have overlapping bits with _PAGE_HPTEFLAGS \ * We filter HPTEFLAGS on set_pte. \ */ \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & SWP_TYPE_MASK); \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ } while (0) #define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ - & ((1UL << SWP_TYPE_BITS) - 1)) +#define SWP_TYPE_MASK ((1UL << SWP_TYPE_BITS) - 1) +#define __swp_type(x) ((x).val & SWP_TYPE_MASK) #define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ - | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) + (type) | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) /* * swp_entry_t must be independent of pte bits. We build a swp_entry_t from * swap type and offset we get from swap and convert that to pte to find a @@ -774,7 +772,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x))) #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) +#define _PAGE_SWP_SOFT_DIRTY _PAGE_NON_IDEMPOTENT #else #define _PAGE_SWP_SOFT_DIRTY 0UL #endif /* CONFIG_MEM_SOFT_DIRTY */ -- cgit v1.2.3 From bff9beaa2e8039d42dd60c27d0f0e2c586a6cb6b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:47 -0700 Subject: powerpc/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE for book3s Right now, the last 5 bits (0x1f) of the swap entry are used for the type and the bit before that (0x20) is used for _PAGE_SWP_SOFT_DIRTY. We cannot use 0x40, as that collides with _RPAGE_RSV1 -- contained in _PAGE_HPTEFLAGS. The next candidate would be _RPAGE_SW3 (0x200) -- which is used for _PAGE_SOFT_DIRTY for !swp ptes. So let's just use _PAGE_SOFT_DIRTY for _PAGE_SWP_SOFT_DIRTY (to make it easier to grasp) and use 0x20 now for _PAGE_SWP_EXCLUSIVE. Link: https://lkml.kernel.org/r/20220329164329.208407-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8e98375d5c4a..eecff2036869 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -752,6 +752,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & SWP_TYPE_MASK); \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_EXCLUSIVE); \ } while (0) #define SWP_TYPE_BITS 5 @@ -772,11 +773,13 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x))) #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY _PAGE_NON_IDEMPOTENT +#define _PAGE_SWP_SOFT_DIRTY _PAGE_SOFT_DIRTY #else #define _PAGE_SWP_SOFT_DIRTY 0UL #endif /* CONFIG_MEM_SOFT_DIRTY */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_NON_IDEMPOTENT + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { @@ -794,6 +797,22 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_EXCLUSIVE)); +} + static inline bool check_pte_access(unsigned long access, unsigned long ptev) { /* -- cgit v1.2.3 From b033767848c4115e486b1a51946de3bee2ac0fa6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 22 Mar 2022 16:40:20 +0100 Subject: powerpc/code-patching: Use jump_label for testing freed initmem Once init is done, initmem is freed forever so no need to test system_state at every call to patch_instruction(). Use jump_label. This reduces by 2% the time needed to activate ftrace on an 8xx. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0aee964721cab7316cffde21a2ca223cee14d373.1647962456.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 2 ++ arch/powerpc/lib/code-patching.c | 5 ++++- arch/powerpc/mm/mem.c | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 409483b2d0ce..bccc3a538b9f 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -22,6 +22,8 @@ #define BRANCH_SET_LINK 0x1 #define BRANCH_ABSOLUTE 0x2 +DECLARE_STATIC_KEY_FALSE(init_mem_is_free); + bool is_offset_in_branch_range(long offset); bool is_offset_in_cond_branch_range(long offset); int create_branch(ppc_inst_t *instr, const u32 *addr, diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index f970f189875b..1ecd166091c9 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -188,10 +189,12 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr) #endif /* CONFIG_STRICT_KERNEL_RWX */ +__ro_after_init DEFINE_STATIC_KEY_FALSE(init_mem_is_free); + int patch_instruction(u32 *addr, ppc_inst_t instr) { /* Make sure we aren't patching a freed init section */ - if (system_state >= SYSTEM_FREEING_INITMEM && init_section_contains(addr, 4)) + if (static_branch_likely(&init_mem_is_free) && init_section_contains(addr, 4)) return 0; return do_patch_instruction(addr, instr); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 4d221d033804..177fae5ea0d1 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -311,6 +312,7 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); + static_branch_enable(&init_mem_is_free); free_initmem_default(POISON_FREE_INITMEM); } -- cgit v1.2.3 From c01013a2f8ddfbdddfff3e288a936be13948cf5d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 4 Apr 2022 15:19:45 +0900 Subject: powerpc: add asm/stat.h to UAPI compile-test coverage asm/stat.h is currently excluded from the UAPI compile-test for ARCH=powerpc because of the errors like follows: HDRTEST usr/include/asm/stat.h In file included from :32: ./usr/include/asm/stat.h:32:2: error: unknown type name 'ino_t' 32 | ino_t st_ino; | ^~~~~ ./usr/include/asm/stat.h:35:2: error: unknown type name 'mode_t' 35 | mode_t st_mode; | ^~~~~~ ./usr/include/asm/stat.h:40:2: error: unknown type name 'uid_t' 40 | uid_t st_uid; | ^~~~~ ./usr/include/asm/stat.h:41:2: error: unknown type name 'gid_t' 41 | gid_t st_gid; | ^~~~~ The errors can be fixed by prefixing the types with __kernel_. Then, remove the no-header-test entry from user/include/Makefile. Signed-off-by: Masahiro Yamada Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig --- arch/powerpc/include/uapi/asm/stat.h | 10 +++++----- usr/include/Makefile | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/uapi/asm/stat.h b/arch/powerpc/include/uapi/asm/stat.h index 7871055e5e32..a28c9a1201fa 100644 --- a/arch/powerpc/include/uapi/asm/stat.h +++ b/arch/powerpc/include/uapi/asm/stat.h @@ -29,16 +29,16 @@ struct __old_kernel_stat { struct stat { unsigned long st_dev; - ino_t st_ino; + __kernel_ino_t st_ino; #ifdef __powerpc64__ unsigned long st_nlink; - mode_t st_mode; + __kernel_mode_t st_mode; #else - mode_t st_mode; + __kernel_mode_t st_mode; unsigned short st_nlink; #endif - uid_t st_uid; - gid_t st_gid; + __kernel_uid_t st_uid; + __kernel_gid_t st_gid; unsigned long st_rdev; long st_size; unsigned long st_blksize; diff --git a/usr/include/Makefile b/usr/include/Makefile index da280bdcb391..9d9dea32e3a0 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -66,7 +66,6 @@ no-header-test += linux/if_bonding.h endif ifeq ($(SRCARCH),powerpc) -no-header-test += asm/stat.h no-header-test += linux/bpf_perf_event.h endif -- cgit v1.2.3 From 861604614a94a7aabc111e4a18aaf5d56d270e8a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 22 Jan 2022 20:56:39 +1000 Subject: KVM: PPC: Book3S HV: HFSCR[PREFIX] does not exist This facility is controlled by FSCR only. Reserved bits should not be set in the HFSCR register (although it's likely harmless as this position would not be re-used, and the L0 is forgiving here too). Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220122105639.3477407-1-npiggin@gmail.com --- arch/powerpc/include/asm/reg.h | 1 - arch/powerpc/kvm/book3s_hv.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 2835f6363228..1e14324c5190 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -417,7 +417,6 @@ #define FSCR_DSCR __MASK(FSCR_DSCR_LG) #define FSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ #define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */ -#define HFSCR_PREFIX __MASK(FSCR_PREFIX_LG) #define HFSCR_MSGP __MASK(FSCR_MSGP_LG) #define HFSCR_TAR __MASK(FSCR_TAR_LG) #define HFSCR_EBB __MASK(FSCR_EBB_LG) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6fa518f6501d..e1e0d1582885 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2834,7 +2834,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) * to trap and then we emulate them. */ vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB | - HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP | HFSCR_PREFIX; + HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP; if (cpu_has_feature(CPU_FTR_HVMODE)) { vcpu->arch.hfscr &= mfspr(SPRN_HFSCR); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -- cgit v1.2.3 From 18827eeef022df43c1fdeca0fde00ca09405dff1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 23 Jan 2022 22:00:38 +1000 Subject: KVM: PPC: Remove kvmppc_claim_lpid Removing kvmppc_claim_lpid makes the lpid allocator API a bit simpler to change the underlying implementation in a future patch. The host LPID is always 0, so that can be a detail of the allocator. If the allocator range is restricted, that can reserve LPIDs at the top of the range. This allows kvmppc_claim_lpid to be removed. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220123120043.3586018-2-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_ppc.h | 1 - arch/powerpc/kvm/book3s_64_mmu_hv.c | 14 ++++++-------- arch/powerpc/kvm/e500mc.c | 1 - arch/powerpc/kvm/powerpc.c | 7 +------ 4 files changed, 7 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 838d4cb460b7..55ff0d88b20a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -877,7 +877,6 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, struct kvm_dirty_tlb *cfg); long kvmppc_alloc_lpid(void); -void kvmppc_claim_lpid(long lpid); void kvmppc_free_lpid(long lpid); void kvmppc_init_lpid(unsigned long nr_lpids); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 0aeb51738ca9..fed4b7652a95 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -256,14 +256,15 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, int kvmppc_mmu_hv_init(void) { - unsigned long host_lpid, rsvd_lpid; + unsigned long rsvd_lpid; if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) return -EINVAL; - host_lpid = 0; - if (cpu_has_feature(CPU_FTR_HVMODE)) - host_lpid = mfspr(SPRN_LPID); + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (WARN_ON(mfspr(SPRN_LPID) != 0)) + return -EINVAL; + } /* POWER8 and above have 12-bit LPIDs (10-bit in POWER7) */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) @@ -271,11 +272,8 @@ int kvmppc_mmu_hv_init(void) else rsvd_lpid = LPID_RSVD_POWER7; - kvmppc_init_lpid(rsvd_lpid + 1); - - kvmppc_claim_lpid(host_lpid); /* rsvd_lpid is reserved for use in partition switching */ - kvmppc_claim_lpid(rsvd_lpid); + kvmppc_init_lpid(rsvd_lpid); return 0; } diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index fa0d8dbbe484..2b9ad8984ea1 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -399,7 +399,6 @@ static int __init kvmppc_e500mc_init(void) * allocator. */ kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); - kvmppc_claim_lpid(0); /* host */ r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 875c30c12db0..1a0e3ede891f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2515,12 +2515,6 @@ long kvmppc_alloc_lpid(void) } EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); -void kvmppc_claim_lpid(long lpid) -{ - set_bit(lpid, lpid_inuse); -} -EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); - void kvmppc_free_lpid(long lpid) { clear_bit(lpid, lpid_inuse); @@ -2531,6 +2525,7 @@ void kvmppc_init_lpid(unsigned long nr_lpids_param) { nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); memset(lpid_inuse, 0, sizeof(lpid_inuse)); + set_bit(0, lpid_inuse); /* The host LPID must always be 0 */ } EXPORT_SYMBOL_GPL(kvmppc_init_lpid); -- cgit v1.2.3 From 5d506f159b2b9d0c9bee9bb43ccafb4f291143c2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 23 Jan 2022 22:00:39 +1000 Subject: KVM: PPC: Book3S HV: Update LPID allocator init for POWER9, Nested The LPID allocator init is changed to: - use mmu_lpid_bits rather than hard-coding; - use KVM_MAX_NESTED_GUESTS for nested hypervisors; - not reserve the top LPID on POWER9 and newer CPUs. The reserved LPID is made a POWER7/8-specific detail. Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220123120043.3586018-3-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_asm.h | 2 +- arch/powerpc/include/asm/reg.h | 2 -- arch/powerpc/kvm/book3s_64_mmu_hv.c | 29 +++++++++++++++++++++-------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ++++++++ arch/powerpc/mm/init_64.c | 3 +++ 5 files changed, 33 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index b6d31bff5209..e6bda70b1d93 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -15,7 +15,7 @@ #define XICS_IPI 2 /* interrupt source # for IPIs */ /* LPIDs we support with this build -- runtime limit may be lower */ -#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) +#define KVMPPC_NR_LPIDS (1UL << 12) /* Maximum number of threads per physical core */ #define MAX_SMT_THREADS 8 diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1e14324c5190..1e8b2e04e626 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -473,8 +473,6 @@ #ifndef SPRN_LPID #define SPRN_LPID 0x13F /* Logical Partition Identifier */ #endif -#define LPID_RSVD_POWER7 0x3ff /* Reserved LPID for partn switching */ -#define LPID_RSVD 0xfff /* Reserved LPID for partn switching */ #define SPRN_HMER 0x150 /* Hypervisor maintenance exception reg */ #define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */ #define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index fed4b7652a95..b19347fa2076 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -256,7 +256,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, int kvmppc_mmu_hv_init(void) { - unsigned long rsvd_lpid; + unsigned long nr_lpids; if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) return -EINVAL; @@ -264,16 +264,29 @@ int kvmppc_mmu_hv_init(void) if (cpu_has_feature(CPU_FTR_HVMODE)) { if (WARN_ON(mfspr(SPRN_LPID) != 0)) return -EINVAL; + nr_lpids = 1UL << mmu_lpid_bits; + } else { + nr_lpids = KVM_MAX_NESTED_GUESTS; } - /* POWER8 and above have 12-bit LPIDs (10-bit in POWER7) */ - if (cpu_has_feature(CPU_FTR_ARCH_207S)) - rsvd_lpid = LPID_RSVD; - else - rsvd_lpid = LPID_RSVD_POWER7; + if (nr_lpids > KVMPPC_NR_LPIDS) + nr_lpids = KVMPPC_NR_LPIDS; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* POWER7 has 10-bit LPIDs, POWER8 has 12-bit LPIDs */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + WARN_ON(nr_lpids != 1UL << 12); + else + WARN_ON(nr_lpids != 1UL << 10); + + /* + * Reserve the last implemented LPID use in partition + * switching for POWER7 and POWER8. + */ + nr_lpids -= 1; + } - /* rsvd_lpid is reserved for use in partition switching */ - kvmppc_init_lpid(rsvd_lpid); + kvmppc_init_lpid(nr_lpids); return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index d185dee26026..0c552885a032 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -50,6 +50,14 @@ #define STACK_SLOT_UAMOR (SFS-88) #define STACK_SLOT_FSCR (SFS-96) +/* + * Use the last LPID (all implemented LPID bits = 1) for partition switching. + * This is reserved in the LPID allocator. POWER7 only implements 0x3ff, but + * we write 0xfff into the LPID SPR anyway, which seems to work and just + * ignores the top bits. + */ +#define LPID_RSVD 0xfff + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 83c0ee9fbf05..0f2608679067 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -372,6 +372,9 @@ void register_page_bootmem_memmap(unsigned long section_nr, #ifdef CONFIG_PPC_BOOK3S_64 unsigned int mmu_lpid_bits; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +EXPORT_SYMBOL_GPL(mmu_lpid_bits); +#endif unsigned int mmu_pid_bits; static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); -- cgit v1.2.3 From c0f00a18e2a8c350a9d263aaf9a2c8bc86caa1b0 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 23 Jan 2022 22:00:41 +1000 Subject: KVM: PPC: Book3S HV Nested: Change nested guest lookup to use idr This removes the fixed sized kvm->arch.nested_guests array. Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220123120043.3586018-5-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_host.h | 3 +- arch/powerpc/kvm/book3s_hv_nested.c | 110 +++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 54 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index faf301d0dec0..29cabb0845f1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -327,8 +327,7 @@ struct kvm_arch { struct list_head uvmem_pfns; struct mutex mmu_setup_lock; /* nests inside vcpu mutexes */ u64 l1_ptcr; - int max_nested_lpid; - struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS]; + struct idr kvm_nested_guest_idr; /* This array can grow quite large, keep it at the end */ struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; #endif diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 9d373f8963ee..1eff969b095c 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -521,11 +521,6 @@ static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp) kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table); } -void kvmhv_vm_nested_init(struct kvm *kvm) -{ - kvm->arch.max_nested_lpid = -1; -} - /* * Handle the H_SET_PARTITION_TABLE hcall. * r4 = guest real address of partition table + log_2(size) - 12 @@ -660,6 +655,35 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) kvmhv_set_nested_ptbl(gp); } +void kvmhv_vm_nested_init(struct kvm *kvm) +{ + idr_init(&kvm->arch.kvm_nested_guest_idr); +} + +static struct kvm_nested_guest *__find_nested(struct kvm *kvm, int lpid) +{ + return idr_find(&kvm->arch.kvm_nested_guest_idr, lpid); +} + +static bool __prealloc_nested(struct kvm *kvm, int lpid) +{ + if (idr_alloc(&kvm->arch.kvm_nested_guest_idr, + NULL, lpid, lpid + 1, GFP_KERNEL) != lpid) + return false; + return true; +} + +static void __add_nested(struct kvm *kvm, int lpid, struct kvm_nested_guest *gp) +{ + if (idr_replace(&kvm->arch.kvm_nested_guest_idr, gp, lpid)) + WARN_ON(1); +} + +static void __remove_nested(struct kvm *kvm, int lpid) +{ + idr_remove(&kvm->arch.kvm_nested_guest_idr, lpid); +} + static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) { struct kvm_nested_guest *gp; @@ -720,13 +744,8 @@ static void kvmhv_remove_nested(struct kvm_nested_guest *gp) long ref; spin_lock(&kvm->mmu_lock); - if (gp == kvm->arch.nested_guests[lpid]) { - kvm->arch.nested_guests[lpid] = NULL; - if (lpid == kvm->arch.max_nested_lpid) { - while (--lpid >= 0 && !kvm->arch.nested_guests[lpid]) - ; - kvm->arch.max_nested_lpid = lpid; - } + if (gp == __find_nested(kvm, lpid)) { + __remove_nested(kvm, lpid); --gp->refcnt; } ref = gp->refcnt; @@ -743,24 +762,22 @@ static void kvmhv_remove_nested(struct kvm_nested_guest *gp) */ void kvmhv_release_all_nested(struct kvm *kvm) { - int i; + int lpid; struct kvm_nested_guest *gp; struct kvm_nested_guest *freelist = NULL; struct kvm_memory_slot *memslot; int srcu_idx, bkt; spin_lock(&kvm->mmu_lock); - for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { - gp = kvm->arch.nested_guests[i]; - if (!gp) - continue; - kvm->arch.nested_guests[i] = NULL; + idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) { + __remove_nested(kvm, lpid); if (--gp->refcnt == 0) { gp->next = freelist; freelist = gp; } } - kvm->arch.max_nested_lpid = -1; + idr_destroy(&kvm->arch.kvm_nested_guest_idr); + /* idr is empty and may be reused at this point */ spin_unlock(&kvm->mmu_lock); while ((gp = freelist) != NULL) { freelist = gp->next; @@ -797,7 +814,7 @@ struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, return NULL; spin_lock(&kvm->mmu_lock); - gp = kvm->arch.nested_guests[l1_lpid]; + gp = __find_nested(kvm, l1_lpid); if (gp) ++gp->refcnt; spin_unlock(&kvm->mmu_lock); @@ -808,17 +825,19 @@ struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, newgp = kvmhv_alloc_nested(kvm, l1_lpid); if (!newgp) return NULL; + + if (!__prealloc_nested(kvm, l1_lpid)) { + kvmhv_release_nested(newgp); + return NULL; + } + spin_lock(&kvm->mmu_lock); - if (kvm->arch.nested_guests[l1_lpid]) { - /* someone else beat us to it */ - gp = kvm->arch.nested_guests[l1_lpid]; - } else { - kvm->arch.nested_guests[l1_lpid] = newgp; + gp = __find_nested(kvm, l1_lpid); + if (!gp) { + __add_nested(kvm, l1_lpid, newgp); ++newgp->refcnt; gp = newgp; newgp = NULL; - if (l1_lpid > kvm->arch.max_nested_lpid) - kvm->arch.max_nested_lpid = l1_lpid; } ++gp->refcnt; spin_unlock(&kvm->mmu_lock); @@ -841,20 +860,13 @@ void kvmhv_put_nested(struct kvm_nested_guest *gp) kvmhv_release_nested(gp); } -static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) -{ - if (lpid > kvm->arch.max_nested_lpid) - return NULL; - return kvm->arch.nested_guests[lpid]; -} - pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid, unsigned long ea, unsigned *hshift) { struct kvm_nested_guest *gp; pte_t *pte; - gp = kvmhv_find_nested(kvm, lpid); + gp = __find_nested(kvm, lpid); if (!gp) return NULL; @@ -960,7 +972,7 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, gpa = n_rmap & RMAP_NESTED_GPA_MASK; lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; - gp = kvmhv_find_nested(kvm, lpid); + gp = __find_nested(kvm, lpid); if (!gp) return; @@ -1152,16 +1164,13 @@ static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric) { struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *gp; - int i; + int lpid; spin_lock(&kvm->mmu_lock); - for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { - gp = kvm->arch.nested_guests[i]; - if (gp) { - spin_unlock(&kvm->mmu_lock); - kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); - spin_lock(&kvm->mmu_lock); - } + idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) { + spin_unlock(&kvm->mmu_lock); + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + spin_lock(&kvm->mmu_lock); } spin_unlock(&kvm->mmu_lock); } @@ -1313,7 +1322,7 @@ long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid, * H_ENTER_NESTED call. Since we can't differentiate this case from * the invalid case, we ignore such flush requests and return success. */ - if (!kvmhv_find_nested(vcpu->kvm, lpid)) + if (!__find_nested(vcpu->kvm, lpid)) return H_SUCCESS; /* @@ -1657,15 +1666,12 @@ long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid) { - int ret = -1; + int ret = lpid + 1; spin_lock(&kvm->mmu_lock); - while (++lpid <= kvm->arch.max_nested_lpid) { - if (kvm->arch.nested_guests[lpid]) { - ret = lpid; - break; - } - } + if (!idr_get_next(&kvm->arch.kvm_nested_guest_idr, &ret)) + ret = -1; spin_unlock(&kvm->mmu_lock); + return ret; } -- cgit v1.2.3 From 03a2e65f54b3acae37f0992133d2f4d1d35f4200 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 23 Jan 2022 22:00:42 +1000 Subject: KVM: PPC: Book3S Nested: Use explicit 4096 LPID maximum Rather than tie this to KVMPPC_NR_LPIDS which is becoming more dynamic, fix it to 4096 (12-bits) explicitly for now. kvmhv_get_nested() does not have to check against KVM_MAX_NESTED_GUESTS because the L1 partition table registration hcall already did that, and it checks against the partition table size. This patch also puts all the partition table size calculations into the same form, using 12 for the architected size field shift and 4 for the shift corresponding to the partition table entry size. Reviewed-by: Fabiano Rosas Signed-of-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220123120043.3586018-6-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_host.h | 7 ++++++- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_nested.c | 24 +++++++++++------------- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 29cabb0845f1..2909a88acd16 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -36,7 +36,12 @@ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #include /* for MAX_SMT_THREADS */ #define KVM_MAX_VCPU_IDS (MAX_SMT_THREADS * KVM_MAX_VCORES) -#define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS + +/* + * Limit the nested partition table to 4096 entries (because that's what + * hardware supports). Both guest and host use this value. + */ +#define KVM_MAX_NESTED_GUESTS_SHIFT 12 #else #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b19347fa2076..a2254390e74b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -266,7 +266,7 @@ int kvmppc_mmu_hv_init(void) return -EINVAL; nr_lpids = 1UL << mmu_lpid_bits; } else { - nr_lpids = KVM_MAX_NESTED_GUESTS; + nr_lpids = 1UL << KVM_MAX_NESTED_GUESTS_SHIFT; } if (nr_lpids > KVMPPC_NR_LPIDS) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 1eff969b095c..75169e0753ce 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -439,10 +439,11 @@ long kvmhv_nested_init(void) if (!radix_enabled()) return -ENODEV; - /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */ - ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1; - if (ptb_order < 8) - ptb_order = 8; + /* Partition table entry is 1<<4 bytes in size, hence the 4. */ + ptb_order = KVM_MAX_NESTED_GUESTS_SHIFT + 4; + /* Minimum partition table size is 1<<12 bytes */ + if (ptb_order < 12) + ptb_order = 12; pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order, GFP_KERNEL); if (!pseries_partition_tb) { @@ -450,7 +451,7 @@ long kvmhv_nested_init(void) return -ENOMEM; } - ptcr = __pa(pseries_partition_tb) | (ptb_order - 8); + ptcr = __pa(pseries_partition_tb) | (ptb_order - 12); rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr); if (rc != H_SUCCESS) { pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n", @@ -534,16 +535,14 @@ long kvmhv_set_partition_table(struct kvm_vcpu *vcpu) long ret = H_SUCCESS; srcu_idx = srcu_read_lock(&kvm->srcu); - /* - * Limit the partition table to 4096 entries (because that's what - * hardware supports), and check the base address. - */ - if ((ptcr & PRTS_MASK) > 12 - 8 || + /* Check partition size and base address. */ + if ((ptcr & PRTS_MASK) + 12 - 4 > KVM_MAX_NESTED_GUESTS_SHIFT || !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT)) ret = H_PARAMETER; srcu_read_unlock(&kvm->srcu, srcu_idx); if (ret == H_SUCCESS) kvm->arch.l1_ptcr = ptcr; + return ret; } @@ -639,7 +638,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) ret = -EFAULT; ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4); - if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8))) { + if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) { int srcu_idx = srcu_read_lock(&kvm->srcu); ret = kvm_read_guest(kvm, ptbl_addr, &ptbl_entry, sizeof(ptbl_entry)); @@ -809,8 +808,7 @@ struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, { struct kvm_nested_guest *gp, *newgp; - if (l1_lpid >= KVM_MAX_NESTED_GUESTS || - l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) + if (l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) return NULL; spin_lock(&kvm->mmu_lock); -- cgit v1.2.3 From f104df7d519ff1aa92c7ec87e124c88d4e7574cd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 23 Jan 2022 22:00:43 +1000 Subject: KVM: PPC: Book3S HV: Remove KVMPPC_NR_LPIDS KVMPPC_NR_LPIDS no longer represents any size restriction on the LPID space and can be removed. A CPU with more than 12 LPID bits implemented will now be able to create more than 4095 guests. Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220123120043.3586018-7-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_asm.h | 3 --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index e6bda70b1d93..c8882d9b86c2 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -14,9 +14,6 @@ #define XICS_MFRR 0xc #define XICS_IPI 2 /* interrupt source # for IPIs */ -/* LPIDs we support with this build -- runtime limit may be lower */ -#define KVMPPC_NR_LPIDS (1UL << 12) - /* Maximum number of threads per physical core */ #define MAX_SMT_THREADS 8 diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index a2254390e74b..f622a39d64e0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -269,9 +269,6 @@ int kvmppc_mmu_hv_init(void) nr_lpids = 1UL << KVM_MAX_NESTED_GUESTS_SHIFT; } - if (nr_lpids > KVMPPC_NR_LPIDS) - nr_lpids = KVMPPC_NR_LPIDS; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) { /* POWER7 has 10-bit LPIDs, POWER8 has 12-bit LPIDs */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) -- cgit v1.2.3 From ad5ace91c55e7bd16813617f67bcb7619d51a295 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 3 Mar 2022 15:33:12 +1000 Subject: KVM: PPC: Book3S HV P9: Move cede logic out of XIVE escalation rearming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the cede abort logic out of xive escalation rearming and into the caller to prepare for handling a similar case with nested guest entry. Signed-off-by: Nicholas Piggin Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220303053315.1056880-4-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/kvm/book3s_hv.c | 10 ++++++++-- arch/powerpc/kvm/book3s_xive.c | 9 ++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 55ff0d88b20a..2f80191e5437 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -685,7 +685,7 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu); -extern void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu); +extern bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu); static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) { @@ -723,7 +723,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir int level, bool line_status) { return -ENODEV; } static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { } -static inline void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { } +static inline bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { return true; } static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) { return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2bf1c23b0171..fecd8e3cc42f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4068,10 +4068,16 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, !(vcpu->arch.shregs.msr & MSR_PR)) { unsigned long req = kvmppc_get_gpr(vcpu, 3); - /* H_CEDE has to be handled now, not later */ + /* H_CEDE has to be handled now */ if (req == H_CEDE) { kvmppc_cede(vcpu); - kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */ + if (!kvmppc_xive_rearm_escalation(vcpu)) { + /* + * Pending escalation so abort + * the cede. + */ + vcpu->arch.ceded = 0; + } kvmppc_set_gpr(vcpu, 3, 0); trap = 0; diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index c0ce5531d9bc..4e7b9f0a21c7 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -179,12 +179,13 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu); -void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) +bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr; + bool ret = true; if (!esc_vaddr) - return; + return ret; /* we are using XIVE with single escalation */ @@ -197,7 +198,7 @@ void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) * we also don't want to set xive_esc_on to 1 here in * case we race with xive_esc_irq(). */ - vcpu->arch.ceded = 0; + ret = false; /* * The escalation interrupts are special as we don't EOI them. * There is no need to use the load-after-store ordering offset @@ -210,6 +211,8 @@ void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00); } mb(); + + return ret; } EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation); -- cgit v1.2.3 From 408835832158df0357e18e96da7f2d1ed6b80e7f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 23 Apr 2022 21:11:41 +0200 Subject: powerpc: define get_cycles macro for arch-override PowerPC defines a get_cycles() function, but it does not do the usual `#define get_cycles get_cycles` dance, making it impossible for generic code to see if an arch-specific function was defined. While the get_cycles() ifdef is not currently used, the following timekeeping patch in this series will depend on the macro existing (or not existing) when defining random_get_entropy(). Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman Signed-off-by: Jason A. Donenfeld --- arch/powerpc/include/asm/timex.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h index fa2e76e4093a..14b4489de52c 100644 --- a/arch/powerpc/include/asm/timex.h +++ b/arch/powerpc/include/asm/timex.h @@ -19,6 +19,7 @@ static inline cycles_t get_cycles(void) { return mftb(); } +#define get_cycles get_cycles #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TIMEX_H */ -- cgit v1.2.3 From ae07562909f3dfcdff40f87e51965728dab50485 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 May 2022 16:48:55 -0700 Subject: mm: change huge_ptep_clear_flush() to return the original pte Patch series "Fix CONT-PTE/PMD size hugetlb issue when unmapping or migrating", v4. presently, migrating a hugetlb page or unmapping a poisoned hugetlb page, we'll use ptep_clear_flush() and set_pte_at() to nuke the page table entry and remap it, and this is incorrect for CONT-PTE or CONT-PMD size hugetlb page, which will cause potential data consistent issue. This patch set will change to use hugetlb related APIs to fix this issue. Note: Mike pointed out the huge_ptep_get() will only return the one specific value, and it would not take into account the dirty or young bits of CONT-PTE/PMDs like the huge_ptep_get_and_clear() [1]. This inconsistent issue is not introduced by this patch set, and this issue will be addressed in another thread [2]. Meanwhile the uffd for hugetlb case [3] pointed out by Gerald also needs another patch to address. [1] https://lore.kernel.org/linux-mm/85bd80b4-b4fd-0d3f-a2e5-149559f2f387@oracle.com/ [2] https://lore.kernel.org/all/cover.1651998586.git.baolin.wang@linux.alibaba.com/ [3] https://lore.kernel.org/linux-mm/20220503120343.6264e126@thinkpad/ This patch (of 3): It is incorrect to use ptep_clear_flush() to nuke a hugetlb page table when unmapping or migrating a hugetlb page, and will change to use huge_ptep_clear_flush() instead in the following patches. So this is a preparation patch, which changes the huge_ptep_clear_flush() to return the original pte to help to nuke a hugetlb page table. [baolin.wang@linux.alibaba.com: fix build in several more architectures] Link: https://lkml.kernel.org/r/0009a4cd-2826-e8be-e671-f050d4f18d5d@linux.alibaba.com [sfr@canb.auug.org.au: fixup] Link: https://lkml.kernel.org/r/20220511181531.7f27a5c1@canb.auug.org.au Link: https://lkml.kernel.org/r/cover.1652270205.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/20f77ddab90baa249bd24504c413189b82acde69.1652270205.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/cover.1652147571.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/dcf065868cce35bceaf138613ad27f17bb7c0c19.1652147571.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Stephen Rothwell Acked-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: James Bottomley Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Arnd Bergmann Cc: Gerald Schaefer Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 4 ++-- arch/arm64/mm/hugetlbpage.c | 12 +++++------- arch/ia64/include/asm/hugetlb.h | 5 +++-- arch/mips/include/asm/hugetlb.h | 9 ++++++--- arch/parisc/include/asm/hugetlb.h | 5 +++-- arch/powerpc/include/asm/hugetlb.h | 9 ++++++--- arch/s390/include/asm/hugetlb.h | 6 +++--- arch/sh/include/asm/hugetlb.h | 5 +++-- arch/sparc/include/asm/hugetlb.h | 5 +++-- include/asm-generic/hugetlb.h | 4 ++-- 10 files changed, 36 insertions(+), 28 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 1242f71937f8..616b2ca8b23e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -39,8 +39,8 @@ extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -extern void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTE_CLEAR extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index cbace1c9e137..ca8e65c94359 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -486,19 +486,17 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); } -void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) { - ptep_clear_flush(vma, addr, ptep); - return; - } + if (!pte_cont(READ_ONCE(*ptep))) + return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); + return get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); } static int __init hugetlbpage_init(void) diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 7e46ebde8c0c..026ead47cd53 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -23,9 +23,10 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, #define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } #include diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index c2144409c0c4..fd69c8808554 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -43,16 +43,19 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + pte_t pte; + /* * clear the huge pte entry firstly, so that the other smp threads will * not get old pte entry after finishing flush_tlb_page and before * setting new huge pte entry */ - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); flush_tlb_page(vma, addr); + return pte; } #define __HAVE_ARCH_HUGE_PTE_NONE diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index a69cf9efb0c1..f7f078c2872c 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -28,9 +28,10 @@ static inline int prepare_hugepage_range(struct file *file, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 6a1a1ac5743b..8a5674fd120d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -43,11 +43,14 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte_t pte; + + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); flush_hugetlb_page(vma, addr); + return pte; } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 32c3fd6e878e..f22beda9e6d5 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -50,10 +50,10 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); } -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index ae4de7b89210..4d3ba39e681c 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -21,9 +21,10 @@ static inline int prepare_hugepage_range(struct file *file, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } static inline void arch_clear_hugepage_flags(struct page *page) diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 53838a173f62..0a26cca24232 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -21,9 +21,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 896f341f614d..a57d667addd2 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -84,10 +84,10 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - ptep_clear_flush(vma, addr, ptep); + return ptep_clear_flush(vma, addr, ptep); } #endif -- cgit v1.2.3 From cad32d9d42e8e6a659786f8a730b221a9fbee227 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 6 May 2022 15:37:55 +1000 Subject: KVM: PPC: Book3s: Retire H_PUT_TCE/etc real mode handlers LoPAPR defines guest visible IOMMU with hypercalls to use it - H_PUT_TCE/etc. Implemented first on POWER7 where hypercalls would trap in the KVM in the real mode (with MMU off). The problem with the real mode is some memory is not available and some API usage crashed the host but enabling MMU was an expensive operation. The problems with the real mode handlers are: 1. Occasionally these cannot complete the request so the code is copied+modified to work in the virtual mode, very little is shared; 2. The real mode handlers have to be linked into vmlinux to work; 3. An exception in real mode immediately reboots the machine. If the small DMA window is used, the real mode handlers bring better performance. However since POWER8, there has always been a bigger DMA window which VMs use to map the entire VM memory to avoid calling H_PUT_TCE. Such 1:1 mapping happens once and uses H_PUT_TCE_INDIRECT (a bulk version of H_PUT_TCE) which virtual mode handler is even closer to its real mode version. On POWER9 hypercalls trap straight to the virtual mode so the real mode handlers never execute on POWER9 and later CPUs. So with the current use of the DMA windows and MMU improvements in POWER9 and later, there is no point in duplicating the code. The 32bit passed through devices may slow down but we do not have many of these in practice. For example, with this applied, a 1Gbit ethernet adapter still demostrates above 800Mbit/s of actual throughput. This removes the real mode handlers from KVM and related code from the powernv platform. This updates the list of implemented hcalls in KVM-HV as the realmode handlers are removed. This changes ABI - kvmppc_h_get_tce() moves to the KVM module and kvmppc_find_table() is static now. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220506053755.3820702-1-aik@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 6 +- arch/powerpc/include/asm/kvm_ppc.h | 2 - arch/powerpc/include/asm/mmu_context.h | 5 - arch/powerpc/kernel/iommu.c | 4 +- arch/powerpc/kvm/Makefile | 3 - arch/powerpc/kvm/book3s_64_vio.c | 43 ++ arch/powerpc/kvm/book3s_64_vio_hv.c | 672 -------------------------- arch/powerpc/kvm/book3s_hv.c | 6 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 - arch/powerpc/mm/book3s64/iommu_api.c | 68 --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 5 +- arch/powerpc/platforms/powernv/pci-ioda.c | 46 +- arch/powerpc/platforms/powernv/pci.h | 3 +- arch/powerpc/platforms/pseries/iommu.c | 3 +- 14 files changed, 75 insertions(+), 801 deletions(-) delete mode 100644 arch/powerpc/kvm/book3s_64_vio_hv.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index d7912b66c874..7e29c73e3dd4 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -51,13 +51,11 @@ struct iommu_table_ops { int (*xchg_no_kill)(struct iommu_table *tbl, long index, unsigned long *hpa, - enum dma_data_direction *direction, - bool realmode); + enum dma_data_direction *direction); void (*tce_kill)(struct iommu_table *tbl, unsigned long index, - unsigned long pages, - bool realmode); + unsigned long pages); __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc); #endif diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2f80191e5437..87e02c60ad23 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -177,8 +177,6 @@ extern void kvmppc_setup_partition_table(struct kvm *kvm); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); -extern struct kvmppc_spapr_tce_table *kvmppc_find_table( - struct kvm *kvm, unsigned long liobn); #define kvmppc_ioba_validate(stt, ioba, npages) \ (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ (stt)->size, (ioba), (npages)) ? \ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b8527a74bd4d..3f25bd3e14eb 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -34,15 +34,10 @@ extern void mm_iommu_init(struct mm_struct *mm); extern void mm_iommu_cleanup(struct mm_struct *mm); extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, unsigned long ua, unsigned long size); -extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm( - struct mm_struct *mm, unsigned long ua, unsigned long size); extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries); extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); -extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa); -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua); extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, unsigned int pageshift, unsigned long *size); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 07093b7cdcb9..6e090e87d6e6 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1065,7 +1065,7 @@ extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, long ret; unsigned long size = 0; - ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false); + ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction); if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL)) && !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, @@ -1080,7 +1080,7 @@ void iommu_tce_kill(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { if (tbl->it_ops->tce_kill) - tbl->it_ops->tce_kill(tbl, entry, pages, false); + tbl->it_ops->tce_kill(tbl, entry, pages); } EXPORT_SYMBOL_GPL(iommu_tce_kill); diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 9bdfc8b50899..8e3681a86074 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -37,9 +37,6 @@ kvm-e500mc-objs := \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) -kvm-book3s_64-builtin-objs-$(CONFIG_SPAPR_TCE_IOMMU) := \ - book3s_64_vio_hv.o - kvm-pr-y := \ fpu.o \ emulate.o \ diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 85cfa6328222..d6589c4fe889 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -32,6 +32,18 @@ #include #include +static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, + unsigned long liobn) +{ + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) + if (stt->liobn == liobn) + return stt; + + return NULL; +} + static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; @@ -753,3 +765,34 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba) +{ + struct kvmppc_spapr_tce_table *stt; + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; + + stt = kvmppc_find_table(vcpu->kvm, liobn); + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; + + idx = (ioba >> stt->page_shift) - stt->offset; + page = stt->pages[idx / TCES_PER_PAGE]; + if (!page) { + vcpu->arch.regs.gpr[4] = 0; + return H_SUCCESS; + } + tbl = (u64 *)page_address(page); + + vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c deleted file mode 100644 index fdeda6a9cff4..000000000000 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ /dev/null @@ -1,672 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Copyright 2010 Paul Mackerras, IBM Corp. - * Copyright 2011 David Gibson, IBM Corporation - * Copyright 2016 Alexey Kardashevskiy, IBM Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_BUG - -#define WARN_ON_ONCE_RM(condition) ({ \ - static bool __section(".data.unlikely") __warned; \ - int __ret_warn_once = !!(condition); \ - \ - if (unlikely(__ret_warn_once && !__warned)) { \ - __warned = true; \ - pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n", \ - __stringify(condition), \ - __func__, __LINE__); \ - dump_stack(); \ - } \ - unlikely(__ret_warn_once); \ -}) - -#else - -#define WARN_ON_ONCE_RM(condition) ({ \ - int __ret_warn_on = !!(condition); \ - unlikely(__ret_warn_on); \ -}) - -#endif - -/* - * Finds a TCE table descriptor by LIOBN. - * - * WARNING: This will be called in real or virtual mode on HV KVM and virtual - * mode on PR KVM - */ -struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, - unsigned long liobn) -{ - struct kvmppc_spapr_tce_table *stt; - - list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) - if (stt->liobn == liobn) - return stt; - - return NULL; -} -EXPORT_SYMBOL_GPL(kvmppc_find_table); - -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -static long kvmppc_rm_tce_to_ua(struct kvm *kvm, - unsigned long tce, unsigned long *ua) -{ - unsigned long gfn = tce >> PAGE_SHIFT; - struct kvm_memory_slot *memslot; - - memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); - if (!memslot) - return -EINVAL; - - *ua = __gfn_to_hva_memslot(memslot, gfn) | - (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); - - return 0; -} - -/* - * Validates TCE address. - * At the moment flags and page mask are validated. - * As the host kernel does not access those addresses (just puts them - * to the table and user space is supposed to process them), we can skip - * checking other things (such as TCE is a guest RAM address or the page - * was actually allocated). - */ -static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long tce) -{ - unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); - enum dma_data_direction dir = iommu_tce_direction(tce); - struct kvmppc_spapr_tce_iommu_table *stit; - unsigned long ua = 0; - - /* Allow userspace to poison TCE table */ - if (dir == DMA_NONE) - return H_SUCCESS; - - if (iommu_tce_check_gpa(stt->page_shift, gpa)) - return H_PARAMETER; - - if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua)) - return H_TOO_HARD; - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - unsigned long hpa = 0; - struct mm_iommu_table_group_mem_t *mem; - long shift = stit->tbl->it_page_shift; - - mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift); - if (!mem) - return H_TOO_HARD; - - if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa)) - return H_TOO_HARD; - } - - return H_SUCCESS; -} - -/* Note on the use of page_address() in real mode, - * - * It is safe to use page_address() in real mode on ppc64 because - * page_address() is always defined as lowmem_page_address() - * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic - * operation and does not access page struct. - * - * Theoretically page_address() could be defined different - * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL - * would have to be enabled. - * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, - * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only - * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP - * is not expected to be enabled on ppc32, page_address() - * is safe for ppc32 as well. - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM - */ -static u64 *kvmppc_page_address(struct page *page) -{ -#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) -#error TODO: fix to avoid page_address() here -#endif - return (u64 *) page_address(page); -} - -/* - * Handles TCE requests for emulated devices. - * Puts guest TCE values to the table and expects user space to convert them. - * Cannot fail so kvmppc_rm_tce_validate must be called before it. - */ -static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, - unsigned long idx, unsigned long tce) -{ - struct page *page; - u64 *tbl; - - idx -= stt->offset; - page = stt->pages[idx / TCES_PER_PAGE]; - /* - * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is - * being cleared, otherwise it returns H_TOO_HARD and we skip this. - */ - if (!page) { - WARN_ON_ONCE_RM(tce != 0); - return; - } - tbl = kvmppc_page_address(page); - - tbl[idx % TCES_PER_PAGE] = tce; -} - -/* - * TCEs pages are allocated in kvmppc_rm_tce_put() which won't be able to do so - * in real mode. - * Check if kvmppc_rm_tce_put() can succeed in real mode, i.e. a TCEs page is - * allocated or not required (when clearing a tce entry). - */ -static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages, bool clearing) -{ - unsigned long i, idx, sttpage, sttpages; - unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages); - - if (ret) - return ret; - /* - * clearing==true says kvmppc_rm_tce_put won't be allocating pages - * for empty tces. - */ - if (clearing) - return H_SUCCESS; - - idx = (ioba >> stt->page_shift) - stt->offset; - sttpage = idx / TCES_PER_PAGE; - sttpages = ALIGN(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) / - TCES_PER_PAGE; - for (i = sttpage; i < sttpage + sttpages; ++i) - if (!stt->pages[i]) - return H_TOO_HARD; - - return H_SUCCESS; -} - -static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm, - struct iommu_table *tbl, - unsigned long entry, unsigned long *hpa, - enum dma_data_direction *direction) -{ - long ret; - - ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true); - - if (!ret && ((*direction == DMA_FROM_DEVICE) || - (*direction == DMA_BIDIRECTIONAL))) { - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); - /* - * kvmppc_rm_tce_iommu_do_map() updates the UA cache after - * calling this so we still get here a valid UA. - */ - if (pua && *pua) - mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua)); - } - - return ret; -} - -static void iommu_tce_kill_rm(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) -{ - if (tbl->it_ops->tce_kill) - tbl->it_ops->tce_kill(tbl, entry, pages, true); -} - -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, - struct iommu_table *tbl, unsigned long entry) -{ - unsigned long i; - unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); - unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); - - for (i = 0; i < subpages; ++i) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; - - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); - } -} - -static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, - struct iommu_table *tbl, unsigned long entry) -{ - struct mm_iommu_table_group_mem_t *mem = NULL; - const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); - - if (!pua) - /* it_userspace allocation might be delayed */ - return H_TOO_HARD; - - mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize); - if (!mem) - return H_TOO_HARD; - - mm_iommu_mapped_dec(mem); - - *pua = cpu_to_be64(0); - - return H_SUCCESS; -} - -static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, - struct iommu_table *tbl, unsigned long entry) -{ - enum dma_data_direction dir = DMA_NONE; - unsigned long hpa = 0; - long ret; - - if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir)) - /* - * real mode xchg can fail if struct page crosses - * a page boundary - */ - return H_TOO_HARD; - - if (dir == DMA_NONE) - return H_SUCCESS; - - ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); - if (ret) - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); - - return ret; -} - -static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, - struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl, - unsigned long entry) -{ - unsigned long i, ret = H_SUCCESS; - unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); - unsigned long io_entry = entry * subpages; - - for (i = 0; i < subpages; ++i) { - ret = kvmppc_rm_tce_iommu_do_unmap(kvm, tbl, io_entry + i); - if (ret != H_SUCCESS) - break; - } - - iommu_tce_kill_rm(tbl, io_entry, subpages); - - return ret; -} - -static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, - unsigned long entry, unsigned long ua, - enum dma_data_direction dir) -{ - long ret; - unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); - struct mm_iommu_table_group_mem_t *mem; - - if (!pua) - /* it_userspace allocation might be delayed */ - return H_TOO_HARD; - - mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); - if (!mem) - return H_TOO_HARD; - - if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, - &hpa))) - return H_TOO_HARD; - - if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) - return H_TOO_HARD; - - ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); - if (ret) { - mm_iommu_mapped_dec(mem); - /* - * real mode xchg can fail if struct page crosses - * a page boundary - */ - return H_TOO_HARD; - } - - if (dir != DMA_NONE) - kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); - - *pua = cpu_to_be64(ua); - - return 0; -} - -static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, - struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl, - unsigned long entry, unsigned long ua, - enum dma_data_direction dir) -{ - unsigned long i, pgoff, ret = H_SUCCESS; - unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); - unsigned long io_entry = entry * subpages; - - for (i = 0, pgoff = 0; i < subpages; - ++i, pgoff += IOMMU_PAGE_SIZE(tbl)) { - - ret = kvmppc_rm_tce_iommu_do_map(kvm, tbl, - io_entry + i, ua + pgoff, dir); - if (ret != H_SUCCESS) - break; - } - - iommu_tce_kill_rm(tbl, io_entry, subpages); - - return ret; -} - -long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba, unsigned long tce) -{ - struct kvmppc_spapr_tce_table *stt; - long ret; - struct kvmppc_spapr_tce_iommu_table *stit; - unsigned long entry, ua = 0; - enum dma_data_direction dir; - - /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ - /* liobn, ioba, tce); */ - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0); - if (ret != H_SUCCESS) - return ret; - - ret = kvmppc_rm_tce_validate(stt, tce); - if (ret != H_SUCCESS) - return ret; - - dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) - return H_PARAMETER; - - entry = ioba >> stt->page_shift; - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - if (dir == DMA_NONE) - ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, stt, - stit->tbl, entry); - else - ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, - stit->tbl, entry, ua, dir); - - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); - return ret; - } - } - - kvmppc_rm_tce_put(stt, entry, tce); - - return H_SUCCESS; -} - -static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq, - unsigned long ua, unsigned long *phpa) -{ - pte_t *ptep, pte; - unsigned shift = 0; - - /* - * Called in real mode with MSR_EE = 0. We are safe here. - * It is ok to do the lookup with arch.pgdir here, because - * we are doing this on secondary cpus and current task there - * is not the hypervisor. Also this is safe against THP in the - * host, because an IPI to primary thread will wait for the secondary - * to exit which will agains result in the below page table walk - * to finish. - */ - /* an rmap lock won't make it safe. because that just ensure hash - * page table entries are removed with rmap lock held. After that - * mmu notifier returns and we go ahead and removing ptes from Qemu page table. - */ - ptep = find_kvm_host_pte(vcpu->kvm, mmu_seq, ua, &shift); - if (!ptep) - return -ENXIO; - - pte = READ_ONCE(*ptep); - if (!pte_present(pte)) - return -ENXIO; - - if (!shift) - shift = PAGE_SHIFT; - - /* Avoid handling anything potentially complicated in realmode */ - if (shift > PAGE_SHIFT) - return -EAGAIN; - - if (!pte_young(pte)) - return -EAGAIN; - - *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) | - (ua & ~PAGE_MASK); - - return 0; -} - -long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, - unsigned long liobn, unsigned long ioba, - unsigned long tce_list, unsigned long npages) -{ - struct kvm *kvm = vcpu->kvm; - struct kvmppc_spapr_tce_table *stt; - long i, ret = H_SUCCESS; - unsigned long tces, entry, ua = 0; - unsigned long mmu_seq; - bool prereg = false; - struct kvmppc_spapr_tce_iommu_table *stit; - - /* - * used to check for invalidations in progress - */ - mmu_seq = kvm->mmu_notifier_seq; - smp_rmb(); - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - entry = ioba >> stt->page_shift; - /* - * The spec says that the maximum size of the list is 512 TCEs - * so the whole table addressed resides in 4K page - */ - if (npages > 512) - return H_PARAMETER; - - if (tce_list & (SZ_4K - 1)) - return H_PARAMETER; - - ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false); - if (ret != H_SUCCESS) - return ret; - - if (mm_iommu_preregistered(vcpu->kvm->mm)) { - /* - * We get here if guest memory was pre-registered which - * is normally VFIO case and gpa->hpa translation does not - * depend on hpt. - */ - struct mm_iommu_table_group_mem_t *mem; - - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua)) - return H_TOO_HARD; - - mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); - if (mem) - prereg = mm_iommu_ua_to_hpa_rm(mem, ua, - IOMMU_PAGE_SHIFT_4K, &tces) == 0; - } - - if (!prereg) { - /* - * This is usually a case of a guest with emulated devices only - * when TCE list is not in preregistered memory. - * We do not require memory to be preregistered in this case - * so lock rmap and do __find_linux_pte_or_hugepte(). - */ - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua)) - return H_TOO_HARD; - - arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); - if (kvmppc_rm_ua_to_hpa(vcpu, mmu_seq, ua, &tces)) { - ret = H_TOO_HARD; - goto unlock_exit; - } - } - - for (i = 0; i < npages; ++i) { - unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); - - ret = kvmppc_rm_tce_validate(stt, tce); - if (ret != H_SUCCESS) - goto unlock_exit; - } - - for (i = 0; i < npages; ++i) { - unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); - - ua = 0; - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { - ret = H_PARAMETER; - goto unlock_exit; - } - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, - stit->tbl, entry + i, ua, - iommu_tce_direction(tce)); - - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, - entry + i); - goto unlock_exit; - } - } - - kvmppc_rm_tce_put(stt, entry + i, tce); - } - -unlock_exit: - if (!prereg) - arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); - return ret; -} - -long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, - unsigned long liobn, unsigned long ioba, - unsigned long tce_value, unsigned long npages) -{ - struct kvmppc_spapr_tce_table *stt; - long i, ret; - struct kvmppc_spapr_tce_iommu_table *stit; - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0); - if (ret != H_SUCCESS) - return ret; - - /* Check permission bits only to allow userspace poison TCE for debug */ - if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) - return H_PARAMETER; - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - unsigned long entry = ioba >> stt->page_shift; - - for (i = 0; i < npages; ++i) { - ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, stt, - stit->tbl, entry + i); - - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) - return ret; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); - } - } - - for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) - kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); - - return ret; -} - -/* This can be called in either virtual mode or real mode */ -long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba) -{ - struct kvmppc_spapr_tce_table *stt; - long ret; - unsigned long idx; - struct page *page; - u64 *tbl; - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - ret = kvmppc_ioba_validate(stt, ioba, 1); - if (ret != H_SUCCESS) - return ret; - - idx = (ioba >> stt->page_shift) - stt->offset; - page = stt->pages[idx / TCES_PER_PAGE]; - if (!page) { - vcpu->arch.regs.gpr[4] = 0; - return H_SUCCESS; - } - tbl = (u64 *)page_address(page); - - vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; - - return H_SUCCESS; -} -EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); - -#endif /* KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c5fd9b4657dd..af3d204678b1 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1326,6 +1326,12 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd) case H_CONFER: case H_REGISTER_VPA: case H_SET_MODE: +#ifdef CONFIG_SPAPR_TCE_IOMMU + case H_GET_TCE: + case H_PUT_TCE: + case H_PUT_TCE_INDIRECT: + case H_STUFF_TCE: +#endif case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: #ifdef CONFIG_KVM_XICS diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 0c552885a032..5cfe4398bb32 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1792,13 +1792,8 @@ hcall_real_table: .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table .long DOTSYM(kvmppc_h_protect) - hcall_real_table -#ifdef CONFIG_SPAPR_TCE_IOMMU - .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table -#else .long 0 /* 0x1c */ .long 0 /* 0x20 */ -#endif .long 0 /* 0x24 - H_SET_SPRG0 */ .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long DOTSYM(kvmppc_rm_h_page_init) - hcall_real_table @@ -1876,13 +1871,8 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table -#ifdef CONFIG_SPAPR_TCE_IOMMU - .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table -#else .long 0 /* 0x138 */ .long 0 /* 0x13c */ -#endif .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index cd18e94d0843..7fcfba162e0d 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -305,24 +305,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(mm_iommu_lookup); -struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, - next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} - struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries) { @@ -369,56 +351,6 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, } EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); -long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - unsigned long *pa; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); - if (!pa) - return -EFAULT; - - *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} - -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) -{ - struct mm_iommu_table_group_mem_t *mem; - long entry; - void *va; - unsigned long *pa; - - mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); - if (!mem) - return; - - if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) - return; - - entry = (ua - mem->ua) >> PAGE_SHIFT; - va = &mem->hpas[entry]; - - pa = (void *) vmalloc_to_phys(va); - if (!pa) - return; - - *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; -} - bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, unsigned int pageshift, unsigned long *size) { diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 30551bbd7988..e96324502db0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -145,8 +145,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, #ifdef CONFIG_IOMMU_API int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction, - bool alloc) + unsigned long *hpa, enum dma_data_direction *direction) { u64 proto_tce = iommu_direction_to_tce_perm(*direction); unsigned long newtce = *hpa | proto_tce, oldtce; @@ -164,7 +163,7 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, } if (!ptce) { - ptce = pnv_tce(tbl, false, idx, alloc); + ptce = pnv_tce(tbl, false, idx, true); if (!ptce) return -ENOMEM; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b722ac902269..aef22ee482bc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1267,22 +1267,20 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, return false; } -static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb, - bool real_mode) +static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb) { - return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) : - (phb->regs + 0x210); + return phb->regs + 0x210; } static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl, - unsigned long index, unsigned long npages, bool rm) + unsigned long index, unsigned long npages) { struct iommu_table_group_link *tgl = list_first_entry_or_null( &tbl->it_group_list, struct iommu_table_group_link, next); struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb); unsigned long start, end, inc; start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset); @@ -1297,11 +1295,7 @@ static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl, mb(); /* Ensure above stores are visible */ while (start <= end) { - if (rm) - __raw_rm_writeq_be(start, invalidate); - else - __raw_writeq_be(start, invalidate); - + __raw_writeq_be(start, invalidate); start += inc; } @@ -1320,7 +1314,7 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, attrs); if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); + pnv_pci_p7ioc_tce_invalidate(tbl, index, npages); return ret; } @@ -1328,10 +1322,9 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, #ifdef CONFIG_IOMMU_API /* Common for IODA1 and IODA2 */ static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction, - bool realmode) + unsigned long *hpa, enum dma_data_direction *direction) { - return pnv_tce_xchg(tbl, index, hpa, direction, !realmode); + return pnv_tce_xchg(tbl, index, hpa, direction); } #endif @@ -1340,7 +1333,7 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, { pnv_tce_free(tbl, index, npages); - pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); + pnv_pci_p7ioc_tce_invalidate(tbl, index, npages); } static struct iommu_table_ops pnv_ioda1_iommu_ops = { @@ -1361,18 +1354,18 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb); unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); mb(); /* Ensure above stores are visible */ __raw_writeq_be(val, invalidate); } -static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, +static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, unsigned shift, unsigned long index, unsigned long npages) { - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb); unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ @@ -1387,10 +1380,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, mb(); while (start <= end) { - if (rm) - __raw_rm_writeq_be(start, invalidate); - else - __raw_writeq_be(start, invalidate); + __raw_writeq_be(start, invalidate); start += inc; } } @@ -1407,7 +1397,7 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) } static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, - unsigned long index, unsigned long npages, bool rm) + unsigned long index, unsigned long npages) { struct iommu_table_group_link *tgl; @@ -1418,7 +1408,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, unsigned int shift = tbl->it_page_shift; if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs) - pnv_pci_phb3_tce_invalidate(pe, rm, shift, + pnv_pci_phb3_tce_invalidate(pe, shift, index, npages); else opal_pci_tce_kill(phb->opal_id, @@ -1437,7 +1427,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, attrs); if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + pnv_pci_ioda2_tce_invalidate(tbl, index, npages); return ret; } @@ -1447,7 +1437,7 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, { pnv_tce_free(tbl, index, npages); - pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + pnv_pci_ioda2_tce_invalidate(tbl, index, npages); } static struct iommu_table_ops pnv_ioda2_iommu_ops = { @@ -2738,7 +2728,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) if (rc != OPAL_SUCCESS) return; - pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false); + pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size); if (pe->table_group.group) { iommu_group_put(pe->table_group.group); WARN_ON(pe->table_group.group); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 966a9eb64339..f12643958b8d 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -311,8 +311,7 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction, - bool alloc); + unsigned long *hpa, enum dma_data_direction *direction); extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 4d991cf840d9..309952a552f7 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -666,8 +666,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) #ifdef CONFIG_IOMMU_API static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned - long *tce, enum dma_data_direction *direction, - bool realmode) + long *tce, enum dma_data_direction *direction) { long rc; unsigned long ioba = (unsigned long) index << tbl->it_page_shift; -- cgit v1.2.3 From b22af9041927075b82bcaf4b6c7a354688198d47 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 9 May 2022 17:11:50 +1000 Subject: KVM: PPC: Book3s: Remove real mode interrupt controller hcalls handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we have 2 sets of interrupt controller hypercalls handlers for real and virtual modes, this is from POWER8 times when switching MMU on was considered an expensive operation. POWER9 however does not have dependent threads and MMU is enabled for handling hcalls so the XIVE native or XICS-on-XIVE real mode handlers never execute on real P9 and later CPUs. This untemplate the handlers and only keeps the real mode handlers for XICS native (up to POWER8) and remove the rest of dead code. Changes in functions are mechanical except few missing empty lines to make checkpatch.pl happy. The default implemented hcalls list already contains XICS hcalls so no change there. This should not cause any behavioral change. Reported-by: kernel test robot Signed-off-by: Alexey Kardashevskiy Acked-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220509071150.181250-1-aik@ozlabs.ru --- arch/powerpc/include/asm/kvm_ppc.h | 7 - arch/powerpc/kvm/Makefile | 2 +- arch/powerpc/kvm/book3s_hv_builtin.c | 64 ---- arch/powerpc/kvm/book3s_hv_rm_xics.c | 5 + arch/powerpc/kvm/book3s_hv_rm_xive.c | 46 --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +- arch/powerpc/kvm/book3s_xive.c | 638 +++++++++++++++++++++++++++++++- arch/powerpc/kvm/book3s_xive.h | 7 - arch/powerpc/kvm/book3s_xive_template.c | 636 ------------------------------- 9 files changed, 632 insertions(+), 785 deletions(-) delete mode 100644 arch/powerpc/kvm/book3s_hv_rm_xive.c delete mode 100644 arch/powerpc/kvm/book3s_xive_template.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 87e02c60ad23..9f625af3b65b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -787,13 +787,6 @@ long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long dest, unsigned long src); long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, unsigned long slb_v, unsigned int status, bool data); -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); -unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu); -unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server); -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr); -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu); /* diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 8e3681a86074..f17379b0f161 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -73,7 +73,7 @@ kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ - book3s_hv_rm_xics.o book3s_hv_rm_xive.o + book3s_hv_rm_xics.o kvm-book3s_64-builtin-tm-objs-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm_builtin.o diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7e52d0beee77..88a8f6473c4e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -489,70 +489,6 @@ static long kvmppc_read_one_intr(bool *again) return kvmppc_check_passthru(xisr, xirr, again); } -#ifdef CONFIG_KVM_XICS -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_xirr(vcpu); - else - return xics_rm_h_xirr(vcpu); -} - -unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - vcpu->arch.regs.gpr[5] = get_tb(); - if (xics_on_xive()) - return xive_rm_h_xirr(vcpu); - else - return xics_rm_h_xirr(vcpu); -} - -unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_ipoll(vcpu, server); - else - return H_TOO_HARD; -} - -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_ipi(vcpu, server, mfrr); - else - return xics_rm_h_ipi(vcpu, server, mfrr); -} - -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_cppr(vcpu, cppr); - else - return xics_rm_h_cppr(vcpu, cppr); -} - -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_eoi(vcpu, xirr); - else - return xics_rm_h_eoi(vcpu, xirr); -} -#endif /* CONFIG_KVM_XICS */ - void kvmppc_bad_interrupt(struct pt_regs *regs) { /* diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 6e16bd751c84..e165bfa842bf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -479,6 +479,11 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } +unsigned long xics_rm_h_xirr_x(struct kvm_vcpu *vcpu) +{ + vcpu->arch.regs.gpr[5] = get_tb(); + return xics_rm_h_xirr(vcpu); +} unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c deleted file mode 100644 index dd9880731bd6..000000000000 --- a/arch/powerpc/kvm/book3s_hv_rm_xive.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "book3s_xive.h" - -/* XXX */ -#include -//#define DBG(fmt...) udbg_printf(fmt) -#define DBG(fmt...) do { } while(0) - -static inline void __iomem *get_tima_phys(void) -{ - return local_paca->kvm_hstate.xive_tima_phys; -} - -#undef XIVE_RUNTIME_CHECKS -#define X_PFX xive_rm_ -#define X_STATIC -#define X_STAT_PFX stat_rm_ -#define __x_tima get_tima_phys() -#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) -#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) -#define __x_writeb __raw_rm_writeb -#define __x_readw __raw_rm_readw -#define __x_readq __raw_rm_readq -#define __x_writeq __raw_rm_writeq - -#include "book3s_xive_template.c" diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5cfe4398bb32..0fc0e68d20d0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1811,11 +1811,11 @@ hcall_real_table: .long 0 /* 0x5c */ .long 0 /* 0x60 */ #ifdef CONFIG_KVM_XICS - .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table + .long DOTSYM(xics_rm_h_eoi) - hcall_real_table + .long DOTSYM(xics_rm_h_cppr) - hcall_real_table + .long DOTSYM(xics_rm_h_ipi) - hcall_real_table + .long 0 /* 0x70 - H_IPOLL */ + .long DOTSYM(xics_rm_h_xirr) - hcall_real_table #else .long 0 /* 0x64 - H_EOI */ .long 0 /* 0x68 - H_CPPR */ @@ -1985,7 +1985,7 @@ hcall_real_table: .long 0 /* 0x2f4 */ .long 0 /* 0x2f8 */ #ifdef CONFIG_KVM_XICS - .long DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table + .long DOTSYM(xics_rm_h_xirr_x) - hcall_real_table #else .long 0 /* 0x2fc - H_XIRR_X*/ #endif diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index ee4be73649e5..3d47862bd666 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -30,27 +30,629 @@ #include "book3s_xive.h" - -/* - * Virtual mode variants of the hcalls for use on radix/radix - * with AIL. They require the VCPU's VP to be "pushed" - * - * We still instantiate them here because we use some of the - * generated utility functions as well in this file. - */ -#define XIVE_RUNTIME_CHECKS -#define X_PFX xive_vm_ -#define X_STATIC static -#define X_STAT_PFX stat_vm_ -#define __x_tima xive_tima #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) -#define __x_writeb __raw_writeb -#define __x_readw __raw_readw -#define __x_readq __raw_readq -#define __x_writeq __raw_writeq -#include "book3s_xive_template.c" +/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */ +#define XICS_DUMMY 1 + +static void xive_vm_ack_pending(struct kvmppc_xive_vcpu *xc) +{ + u8 cppr; + u16 ack; + + /* + * Ensure any previous store to CPPR is ordered vs. + * the subsequent loads from PIPR or ACK. + */ + eieio(); + + /* Perform the acknowledge OS to register cycle. */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* XXX Check grouping level */ + + /* Anything ? */ + if (!((ack >> 8) & TM_QW1_NSR_EO)) + return; + + /* Grab CPPR of the most favored pending interrupt */ + cppr = ack & 0xff; + if (cppr < 8) + xc->pending |= 1 << cppr; + + /* Check consistency */ + if (cppr >= xc->hw_cppr) + pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->hw_cppr); + + /* + * Update our image of the HW CPPR. We don't yet modify + * xc->cppr, this will be done as we scan for interrupts + * in the queues. + */ + xc->hw_cppr = cppr; +} + +static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + offset |= XIVE_ESB_LD_ST_MO; + + val = __raw_readq(__x_eoi_page(xd) + offset); +#ifdef __LITTLE_ENDIAN__ + val >>= 64-8; +#endif + return (u8)val; +} + + +static void xive_vm_source_eoi(u32 hw_irq, struct xive_irq_data *xd) +{ + /* If the XIVE supports the new "store EOI facility, use it */ + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + __raw_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI); + else if (xd->flags & XIVE_IRQ_FLAG_LSI) { + /* + * For LSIs the HW EOI cycle is used rather than PQ bits, + * as they are automatically re-triggred in HW when still + * pending. + */ + __raw_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI); + } else { + uint64_t eoi_val; + + /* + * Otherwise for EOI, we use the special MMIO that does + * a clear of both P and Q and returns the old Q, + * except for LSIs where we use the "EOI cycle" special + * load. + * + * This allows us to then do a re-trigger if Q was set + * rather than synthetizing an interrupt in software + */ + eoi_val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_00); + + /* Re-trigger if needed */ + if ((eoi_val & 1) && __x_trig_page(xd)) + __raw_writeq(0, __x_trig_page(xd)); + } +} + +enum { + scan_fetch, + scan_poll, + scan_eoi, +}; + +static u32 xive_vm_scan_interrupts(struct kvmppc_xive_vcpu *xc, + u8 pending, int scan_type) +{ + u32 hirq = 0; + u8 prio = 0xff; + + /* Find highest pending priority */ + while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) { + struct xive_q *q; + u32 idx, toggle; + __be32 *qpage; + + /* + * If pending is 0 this will return 0xff which is what + * we want + */ + prio = ffs(pending) - 1; + + /* Don't scan past the guest cppr */ + if (prio >= xc->cppr || prio > 7) { + if (xc->mfrr < xc->cppr) { + prio = xc->mfrr; + hirq = XICS_IPI; + } + break; + } + + /* Grab queue and pointers */ + q = &xc->queues[prio]; + idx = q->idx; + toggle = q->toggle; + + /* + * Snapshot the queue page. The test further down for EOI + * must use the same "copy" that was used by __xive_read_eq + * since qpage can be set concurrently and we don't want + * to miss an EOI. + */ + qpage = READ_ONCE(q->qpage); + +skip_ipi: + /* + * Try to fetch from the queue. Will return 0 for a + * non-queueing priority (ie, qpage = 0). + */ + hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle); + + /* + * If this was a signal for an MFFR change done by + * H_IPI we skip it. Additionally, if we were fetching + * we EOI it now, thus re-enabling reception of a new + * such signal. + * + * We also need to do that if prio is 0 and we had no + * page for the queue. In this case, we have non-queued + * IPI that needs to be EOId. + * + * This is safe because if we have another pending MFRR + * change that wasn't observed above, the Q bit will have + * been set and another occurrence of the IPI will trigger. + */ + if (hirq == XICS_IPI || (prio == 0 && !qpage)) { + if (scan_type == scan_fetch) { + xive_vm_source_eoi(xc->vp_ipi, + &xc->vp_ipi_data); + q->idx = idx; + q->toggle = toggle; + } + /* Loop back on same queue with updated idx/toggle */ + WARN_ON(hirq && hirq != XICS_IPI); + if (hirq) + goto skip_ipi; + } + + /* If it's the dummy interrupt, continue searching */ + if (hirq == XICS_DUMMY) + goto skip_ipi; + + /* Clear the pending bit if the queue is now empty */ + if (!hirq) { + pending &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. + */ + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + + if (p) { + WARN_ON(p > atomic_read(&q->count)); + atomic_sub(p, &q->count); + } + } + } + + /* + * If the most favoured prio we found pending is less + * favored (or equal) than a pending IPI, we return + * the IPI instead. + */ + if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { + prio = xc->mfrr; + hirq = XICS_IPI; + break; + } + + /* If fetching, update queue pointers */ + if (scan_type == scan_fetch) { + q->idx = idx; + q->toggle = toggle; + } + } + + /* If we are just taking a "peek", do nothing else */ + if (scan_type == scan_poll) + return hirq; + + /* Update the pending bits */ + xc->pending = pending; + + /* + * If this is an EOI that's it, no CPPR adjustment done here, + * all we needed was cleanup the stale pending bits and check + * if there's anything left. + */ + if (scan_type == scan_eoi) + return hirq; + + /* + * If we found an interrupt, adjust what the guest CPPR should + * be as if we had just fetched that interrupt from HW. + * + * Note: This can only make xc->cppr smaller as the previous + * loop will only exit with hirq != 0 if prio is lower than + * the current xc->cppr. Thus we don't need to re-check xc->mfrr + * for pending IPIs. + */ + if (hirq) + xc->cppr = prio; + /* + * If it was an IPI the HW CPPR might have been lowered too much + * as the HW interrupt we use for IPIs is routed to priority 0. + * + * We re-sync it here. + */ + if (xc->cppr != xc->hw_cppr) { + xc->hw_cppr = xc->cppr; + __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR); + } + + return hirq; +} + +static unsigned long xive_vm_h_xirr(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 old_cppr; + u32 hirq; + + pr_devel("H_XIRR\n"); + + xc->stat_vm_h_xirr++; + + /* First collect pending bits from HW */ + xive_vm_ack_pending(xc); + + pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", + xc->pending, xc->hw_cppr, xc->cppr); + + /* Grab previous CPPR and reverse map it */ + old_cppr = xive_prio_to_guest(xc->cppr); + + /* Scan for actual interrupts */ + hirq = xive_vm_scan_interrupts(xc, xc->pending, scan_fetch); + + pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n", + hirq, xc->hw_cppr, xc->cppr); + + /* That should never hit */ + if (hirq & 0xff000000) + pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq); + + /* + * XXX We could check if the interrupt is masked here and + * filter it. If we chose to do so, we would need to do: + * + * if (masked) { + * lock(); + * if (masked) { + * old_Q = true; + * hirq = 0; + * } + * unlock(); + * } + */ + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24); + + return H_SUCCESS; +} + +static unsigned long xive_vm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 pending = xc->pending; + u32 hirq; + + pr_devel("H_IPOLL(server=%ld)\n", server); + + xc->stat_vm_h_ipoll++; + + /* Grab the target VCPU if not the current one */ + if (xc->server_num != server) { + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Scan all priorities */ + pending = 0xff; + } else { + /* Grab pending interrupt if any */ + __be64 qw1 = __raw_readq(xive_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; + + if (pipr < 8) + pending |= 1 << pipr; + } + + hirq = xive_vm_scan_interrupts(xc, pending, scan_poll); + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24); + + return H_SUCCESS; +} + +static void xive_vm_push_pending_to_hw(struct kvmppc_xive_vcpu *xc) +{ + u8 pending, prio; + + pending = xc->pending; + if (xc->mfrr != 0xff) { + if (xc->mfrr < 8) + pending |= 1 << xc->mfrr; + else + pending |= 0x80; + } + if (!pending) + return; + prio = ffs(pending) - 1; + + __raw_writeb(prio, xive_tima + TM_SPC_SET_OS_PENDING); +} + +static void xive_vm_scan_for_rerouted_irqs(struct kvmppc_xive *xive, + struct kvmppc_xive_vcpu *xc) +{ + unsigned int prio; + + /* For each priority that is now masked */ + for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) { + struct xive_q *q = &xc->queues[prio]; + struct kvmppc_xive_irq_state *state; + struct kvmppc_xive_src_block *sb; + u32 idx, toggle, entry, irq, hw_num; + struct xive_irq_data *xd; + __be32 *qpage; + u16 src; + + idx = q->idx; + toggle = q->toggle; + qpage = READ_ONCE(q->qpage); + if (!qpage) + continue; + + /* For each interrupt in the queue */ + for (;;) { + entry = be32_to_cpup(qpage + idx); + + /* No more ? */ + if ((entry >> 31) == toggle) + break; + irq = entry & 0x7fffffff; + + /* Skip dummies and IPIs */ + if (irq == XICS_DUMMY || irq == XICS_IPI) + goto next; + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) + goto next; + state = &sb->irq_state[src]; + + /* Has it been rerouted ? */ + if (xc->server_num == state->act_server) + goto next; + + /* + * Allright, it *has* been re-routed, kill it from + * the queue. + */ + qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY); + + /* Find the HW interrupt */ + kvmppc_xive_select_irq(state, &hw_num, &xd); + + /* If it's not an LSI, set PQ to 11 the EOI will force a resend */ + if (!(xd->flags & XIVE_IRQ_FLAG_LSI)) + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11); + + /* EOI the source */ + xive_vm_source_eoi(hw_num, xd); + +next: + idx = (idx + 1) & q->msk; + if (idx == 0) + toggle ^= 1; + } + } +} + +static int xive_vm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + u8 old_cppr; + + pr_devel("H_CPPR(cppr=%ld)\n", cppr); + + xc->stat_vm_h_cppr++; + + /* Map CPPR */ + cppr = xive_prio_from_guest(cppr); + + /* Remember old and update SW state */ + old_cppr = xc->cppr; + xc->cppr = cppr; + + /* + * Order the above update of xc->cppr with the subsequent + * read of xc->mfrr inside push_pending_to_hw() + */ + smp_mb(); + + if (cppr > old_cppr) { + /* + * We are masking less, we need to look for pending things + * to deliver and set VP pending bits accordingly to trigger + * a new interrupt otherwise we might miss MFRR changes for + * which we have optimized out sending an IPI signal. + */ + xive_vm_push_pending_to_hw(xc); + } else { + /* + * We are masking more, we need to check the queue for any + * interrupt that has been routed to another CPU, take + * it out (replace it with the dummy) and retrigger it. + * + * This is necessary since those interrupts may otherwise + * never be processed, at least not until this CPU restores + * its CPPR. + * + * This is in theory racy vs. HW adding new interrupts to + * the queue. In practice this works because the interesting + * cases are when the guest has done a set_xive() to move the + * interrupt away, which flushes the xive, followed by the + * target CPU doing a H_CPPR. So any new interrupt coming into + * the queue must still be routed to us and isn't a source + * of concern. + */ + xive_vm_scan_for_rerouted_irqs(xive, xc); + } + + /* Apply new CPPR */ + xc->hw_cppr = cppr; + __raw_writeb(cppr, xive_tima + TM_QW1_OS + TM_CPPR); + + return H_SUCCESS; +} + +static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_irq_data *xd; + u8 new_cppr = xirr >> 24; + u32 irq = xirr & 0x00ffffff, hw_num; + u16 src; + int rc = 0; + + pr_devel("H_EOI(xirr=%08lx)\n", xirr); + + xc->stat_vm_h_eoi++; + + xc->cppr = xive_prio_from_guest(new_cppr); + + /* + * IPIs are synthetized from MFRR and thus don't need + * any special EOI handling. The underlying interrupt + * used to signal MFRR changes is EOId when fetched from + * the queue. + */ + if (irq == XICS_IPI || irq == 0) { + /* + * This barrier orders the setting of xc->cppr vs. + * subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); + goto bail; + } + + /* Find interrupt source */ + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) { + pr_devel(" source not found !\n"); + rc = H_PARAMETER; + /* Same as above */ + smp_mb(); + goto bail; + } + state = &sb->irq_state[src]; + kvmppc_xive_select_irq(state, &hw_num, &xd); + + state->in_eoi = true; + + /* + * This barrier orders both setting of in_eoi above vs, + * subsequent test of guest_priority, and the setting + * of xc->cppr vs. subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); + +again: + if (state->guest_priority == MASKED) { + arch_spin_lock(&sb->lock); + if (state->guest_priority != MASKED) { + arch_spin_unlock(&sb->lock); + goto again; + } + pr_devel(" EOI on saved P...\n"); + + /* Clear old_p, that will cause unmask to perform an EOI */ + state->old_p = false; + + arch_spin_unlock(&sb->lock); + } else { + pr_devel(" EOI on source...\n"); + + /* Perform EOI on the source */ + xive_vm_source_eoi(hw_num, xd); + + /* If it's an emulated LSI, check level and resend */ + if (state->lsi && state->asserted) + __raw_writeq(0, __x_trig_page(xd)); + + } + + /* + * This barrier orders the above guest_priority check + * and spin_lock/unlock with clearing in_eoi below. + * + * It also has to be a full mb() as it must ensure + * the MMIOs done in source_eoi() are completed before + * state->in_eoi is visible. + */ + mb(); + state->in_eoi = false; +bail: + + /* Re-evaluate pending IRQs and update HW */ + xive_vm_scan_interrupts(xc, xc->pending, scan_eoi); + xive_vm_push_pending_to_hw(xc); + pr_devel(" after scan pending=%02x\n", xc->pending); + + /* Apply new CPPR */ + xc->hw_cppr = xc->cppr; + __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR); + + return rc; +} + +static int xive_vm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr); + + xc->stat_vm_h_ipi++; + + /* Find target */ + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Locklessly write over MFRR */ + xc->mfrr = mfrr; + + /* + * The load of xc->cppr below and the subsequent MMIO store + * to the IPI must happen after the above mfrr update is + * globally visible so that: + * + * - Synchronize with another CPU doing an H_EOI or a H_CPPR + * updating xc->cppr then reading xc->mfrr. + * + * - The target of the IPI sees the xc->mfrr update + */ + mb(); + + /* Shoot the IPI if most favored than target cppr */ + if (mfrr < xc->cppr) + __raw_writeq(0, __x_trig_page(&xc->vp_ipi_data)); + + return H_SUCCESS; +} /* * We leave a gap of a couple of interrupts in the queue to diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 09d0657596c3..1e48f72e8aa5 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -285,13 +285,6 @@ static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) return cur & 0x7fffffff; } -extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu); -extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server); -extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr); -extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); -extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); - /* * Common Xive routines for XICS-over-XIVE and XIVE native */ diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c deleted file mode 100644 index b0015e05d99a..000000000000 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ /dev/null @@ -1,636 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation - */ - -/* File to be included by other .c files */ - -#define XGLUE(a,b) a##b -#define GLUE(a,b) XGLUE(a,b) - -/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */ -#define XICS_DUMMY 1 - -static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) -{ - u8 cppr; - u16 ack; - - /* - * Ensure any previous store to CPPR is ordered vs. - * the subsequent loads from PIPR or ACK. - */ - eieio(); - - /* Perform the acknowledge OS to register cycle. */ - ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG)); - - /* Synchronize subsequent queue accesses */ - mb(); - - /* XXX Check grouping level */ - - /* Anything ? */ - if (!((ack >> 8) & TM_QW1_NSR_EO)) - return; - - /* Grab CPPR of the most favored pending interrupt */ - cppr = ack & 0xff; - if (cppr < 8) - xc->pending |= 1 << cppr; - -#ifdef XIVE_RUNTIME_CHECKS - /* Check consistency */ - if (cppr >= xc->hw_cppr) - pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n", - smp_processor_id(), cppr, xc->hw_cppr); -#endif - - /* - * Update our image of the HW CPPR. We don't yet modify - * xc->cppr, this will be done as we scan for interrupts - * in the queues. - */ - xc->hw_cppr = cppr; -} - -static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset) -{ - u64 val; - - if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI) - offset |= XIVE_ESB_LD_ST_MO; - - val =__x_readq(__x_eoi_page(xd) + offset); -#ifdef __LITTLE_ENDIAN__ - val >>= 64-8; -#endif - return (u8)val; -} - - -static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) -{ - /* If the XIVE supports the new "store EOI facility, use it */ - if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) - __x_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI); - else if (xd->flags & XIVE_IRQ_FLAG_LSI) { - /* - * For LSIs the HW EOI cycle is used rather than PQ bits, - * as they are automatically re-triggred in HW when still - * pending. - */ - __x_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI); - } else { - uint64_t eoi_val; - - /* - * Otherwise for EOI, we use the special MMIO that does - * a clear of both P and Q and returns the old Q, - * except for LSIs where we use the "EOI cycle" special - * load. - * - * This allows us to then do a re-trigger if Q was set - * rather than synthetizing an interrupt in software - */ - eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00); - - /* Re-trigger if needed */ - if ((eoi_val & 1) && __x_trig_page(xd)) - __x_writeq(0, __x_trig_page(xd)); - } -} - -enum { - scan_fetch, - scan_poll, - scan_eoi, -}; - -static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc, - u8 pending, int scan_type) -{ - u32 hirq = 0; - u8 prio = 0xff; - - /* Find highest pending priority */ - while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) { - struct xive_q *q; - u32 idx, toggle; - __be32 *qpage; - - /* - * If pending is 0 this will return 0xff which is what - * we want - */ - prio = ffs(pending) - 1; - - /* Don't scan past the guest cppr */ - if (prio >= xc->cppr || prio > 7) { - if (xc->mfrr < xc->cppr) { - prio = xc->mfrr; - hirq = XICS_IPI; - } - break; - } - - /* Grab queue and pointers */ - q = &xc->queues[prio]; - idx = q->idx; - toggle = q->toggle; - - /* - * Snapshot the queue page. The test further down for EOI - * must use the same "copy" that was used by __xive_read_eq - * since qpage can be set concurrently and we don't want - * to miss an EOI. - */ - qpage = READ_ONCE(q->qpage); - -skip_ipi: - /* - * Try to fetch from the queue. Will return 0 for a - * non-queueing priority (ie, qpage = 0). - */ - hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle); - - /* - * If this was a signal for an MFFR change done by - * H_IPI we skip it. Additionally, if we were fetching - * we EOI it now, thus re-enabling reception of a new - * such signal. - * - * We also need to do that if prio is 0 and we had no - * page for the queue. In this case, we have non-queued - * IPI that needs to be EOId. - * - * This is safe because if we have another pending MFRR - * change that wasn't observed above, the Q bit will have - * been set and another occurrence of the IPI will trigger. - */ - if (hirq == XICS_IPI || (prio == 0 && !qpage)) { - if (scan_type == scan_fetch) { - GLUE(X_PFX,source_eoi)(xc->vp_ipi, - &xc->vp_ipi_data); - q->idx = idx; - q->toggle = toggle; - } - /* Loop back on same queue with updated idx/toggle */ -#ifdef XIVE_RUNTIME_CHECKS - WARN_ON(hirq && hirq != XICS_IPI); -#endif - if (hirq) - goto skip_ipi; - } - - /* If it's the dummy interrupt, continue searching */ - if (hirq == XICS_DUMMY) - goto skip_ipi; - - /* Clear the pending bit if the queue is now empty */ - if (!hirq) { - pending &= ~(1 << prio); - - /* - * Check if the queue count needs adjusting due to - * interrupts being moved away. - */ - if (atomic_read(&q->pending_count)) { - int p = atomic_xchg(&q->pending_count, 0); - if (p) { -#ifdef XIVE_RUNTIME_CHECKS - WARN_ON(p > atomic_read(&q->count)); -#endif - atomic_sub(p, &q->count); - } - } - } - - /* - * If the most favoured prio we found pending is less - * favored (or equal) than a pending IPI, we return - * the IPI instead. - */ - if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { - prio = xc->mfrr; - hirq = XICS_IPI; - break; - } - - /* If fetching, update queue pointers */ - if (scan_type == scan_fetch) { - q->idx = idx; - q->toggle = toggle; - } - } - - /* If we are just taking a "peek", do nothing else */ - if (scan_type == scan_poll) - return hirq; - - /* Update the pending bits */ - xc->pending = pending; - - /* - * If this is an EOI that's it, no CPPR adjustment done here, - * all we needed was cleanup the stale pending bits and check - * if there's anything left. - */ - if (scan_type == scan_eoi) - return hirq; - - /* - * If we found an interrupt, adjust what the guest CPPR should - * be as if we had just fetched that interrupt from HW. - * - * Note: This can only make xc->cppr smaller as the previous - * loop will only exit with hirq != 0 if prio is lower than - * the current xc->cppr. Thus we don't need to re-check xc->mfrr - * for pending IPIs. - */ - if (hirq) - xc->cppr = prio; - /* - * If it was an IPI the HW CPPR might have been lowered too much - * as the HW interrupt we use for IPIs is routed to priority 0. - * - * We re-sync it here. - */ - if (xc->cppr != xc->hw_cppr) { - xc->hw_cppr = xc->cppr; - __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); - } - - return hirq; -} - -X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - u8 old_cppr; - u32 hirq; - - pr_devel("H_XIRR\n"); - - xc->GLUE(X_STAT_PFX,h_xirr)++; - - /* First collect pending bits from HW */ - GLUE(X_PFX,ack_pending)(xc); - - pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", - xc->pending, xc->hw_cppr, xc->cppr); - - /* Grab previous CPPR and reverse map it */ - old_cppr = xive_prio_to_guest(xc->cppr); - - /* Scan for actual interrupts */ - hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch); - - pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n", - hirq, xc->hw_cppr, xc->cppr); - -#ifdef XIVE_RUNTIME_CHECKS - /* That should never hit */ - if (hirq & 0xff000000) - pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq); -#endif - - /* - * XXX We could check if the interrupt is masked here and - * filter it. If we chose to do so, we would need to do: - * - * if (masked) { - * lock(); - * if (masked) { - * old_Q = true; - * hirq = 0; - * } - * unlock(); - * } - */ - - /* Return interrupt and old CPPR in GPR4 */ - vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24); - - return H_SUCCESS; -} - -X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - u8 pending = xc->pending; - u32 hirq; - - pr_devel("H_IPOLL(server=%ld)\n", server); - - xc->GLUE(X_STAT_PFX,h_ipoll)++; - - /* Grab the target VCPU if not the current one */ - if (xc->server_num != server) { - vcpu = kvmppc_xive_find_server(vcpu->kvm, server); - if (!vcpu) - return H_PARAMETER; - xc = vcpu->arch.xive_vcpu; - - /* Scan all priorities */ - pending = 0xff; - } else { - /* Grab pending interrupt if any */ - __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); - u8 pipr = be64_to_cpu(qw1) & 0xff; - if (pipr < 8) - pending |= 1 << pipr; - } - - hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll); - - /* Return interrupt and old CPPR in GPR4 */ - vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24); - - return H_SUCCESS; -} - -static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc) -{ - u8 pending, prio; - - pending = xc->pending; - if (xc->mfrr != 0xff) { - if (xc->mfrr < 8) - pending |= 1 << xc->mfrr; - else - pending |= 0x80; - } - if (!pending) - return; - prio = ffs(pending) - 1; - - __x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING); -} - -static void GLUE(X_PFX,scan_for_rerouted_irqs)(struct kvmppc_xive *xive, - struct kvmppc_xive_vcpu *xc) -{ - unsigned int prio; - - /* For each priority that is now masked */ - for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) { - struct xive_q *q = &xc->queues[prio]; - struct kvmppc_xive_irq_state *state; - struct kvmppc_xive_src_block *sb; - u32 idx, toggle, entry, irq, hw_num; - struct xive_irq_data *xd; - __be32 *qpage; - u16 src; - - idx = q->idx; - toggle = q->toggle; - qpage = READ_ONCE(q->qpage); - if (!qpage) - continue; - - /* For each interrupt in the queue */ - for (;;) { - entry = be32_to_cpup(qpage + idx); - - /* No more ? */ - if ((entry >> 31) == toggle) - break; - irq = entry & 0x7fffffff; - - /* Skip dummies and IPIs */ - if (irq == XICS_DUMMY || irq == XICS_IPI) - goto next; - sb = kvmppc_xive_find_source(xive, irq, &src); - if (!sb) - goto next; - state = &sb->irq_state[src]; - - /* Has it been rerouted ? */ - if (xc->server_num == state->act_server) - goto next; - - /* - * Allright, it *has* been re-routed, kill it from - * the queue. - */ - qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY); - - /* Find the HW interrupt */ - kvmppc_xive_select_irq(state, &hw_num, &xd); - - /* If it's not an LSI, set PQ to 11 the EOI will force a resend */ - if (!(xd->flags & XIVE_IRQ_FLAG_LSI)) - GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_11); - - /* EOI the source */ - GLUE(X_PFX,source_eoi)(hw_num, xd); - - next: - idx = (idx + 1) & q->msk; - if (idx == 0) - toggle ^= 1; - } - } -} - -X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - struct kvmppc_xive *xive = vcpu->kvm->arch.xive; - u8 old_cppr; - - pr_devel("H_CPPR(cppr=%ld)\n", cppr); - - xc->GLUE(X_STAT_PFX,h_cppr)++; - - /* Map CPPR */ - cppr = xive_prio_from_guest(cppr); - - /* Remember old and update SW state */ - old_cppr = xc->cppr; - xc->cppr = cppr; - - /* - * Order the above update of xc->cppr with the subsequent - * read of xc->mfrr inside push_pending_to_hw() - */ - smp_mb(); - - if (cppr > old_cppr) { - /* - * We are masking less, we need to look for pending things - * to deliver and set VP pending bits accordingly to trigger - * a new interrupt otherwise we might miss MFRR changes for - * which we have optimized out sending an IPI signal. - */ - GLUE(X_PFX,push_pending_to_hw)(xc); - } else { - /* - * We are masking more, we need to check the queue for any - * interrupt that has been routed to another CPU, take - * it out (replace it with the dummy) and retrigger it. - * - * This is necessary since those interrupts may otherwise - * never be processed, at least not until this CPU restores - * its CPPR. - * - * This is in theory racy vs. HW adding new interrupts to - * the queue. In practice this works because the interesting - * cases are when the guest has done a set_xive() to move the - * interrupt away, which flushes the xive, followed by the - * target CPU doing a H_CPPR. So any new interrupt coming into - * the queue must still be routed to us and isn't a source - * of concern. - */ - GLUE(X_PFX,scan_for_rerouted_irqs)(xive, xc); - } - - /* Apply new CPPR */ - xc->hw_cppr = cppr; - __x_writeb(cppr, __x_tima + TM_QW1_OS + TM_CPPR); - - return H_SUCCESS; -} - -X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) -{ - struct kvmppc_xive *xive = vcpu->kvm->arch.xive; - struct kvmppc_xive_src_block *sb; - struct kvmppc_xive_irq_state *state; - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - struct xive_irq_data *xd; - u8 new_cppr = xirr >> 24; - u32 irq = xirr & 0x00ffffff, hw_num; - u16 src; - int rc = 0; - - pr_devel("H_EOI(xirr=%08lx)\n", xirr); - - xc->GLUE(X_STAT_PFX,h_eoi)++; - - xc->cppr = xive_prio_from_guest(new_cppr); - - /* - * IPIs are synthetized from MFRR and thus don't need - * any special EOI handling. The underlying interrupt - * used to signal MFRR changes is EOId when fetched from - * the queue. - */ - if (irq == XICS_IPI || irq == 0) { - /* - * This barrier orders the setting of xc->cppr vs. - * subsquent test of xc->mfrr done inside - * scan_interrupts and push_pending_to_hw - */ - smp_mb(); - goto bail; - } - - /* Find interrupt source */ - sb = kvmppc_xive_find_source(xive, irq, &src); - if (!sb) { - pr_devel(" source not found !\n"); - rc = H_PARAMETER; - /* Same as above */ - smp_mb(); - goto bail; - } - state = &sb->irq_state[src]; - kvmppc_xive_select_irq(state, &hw_num, &xd); - - state->in_eoi = true; - - /* - * This barrier orders both setting of in_eoi above vs, - * subsequent test of guest_priority, and the setting - * of xc->cppr vs. subsquent test of xc->mfrr done inside - * scan_interrupts and push_pending_to_hw - */ - smp_mb(); - -again: - if (state->guest_priority == MASKED) { - arch_spin_lock(&sb->lock); - if (state->guest_priority != MASKED) { - arch_spin_unlock(&sb->lock); - goto again; - } - pr_devel(" EOI on saved P...\n"); - - /* Clear old_p, that will cause unmask to perform an EOI */ - state->old_p = false; - - arch_spin_unlock(&sb->lock); - } else { - pr_devel(" EOI on source...\n"); - - /* Perform EOI on the source */ - GLUE(X_PFX,source_eoi)(hw_num, xd); - - /* If it's an emulated LSI, check level and resend */ - if (state->lsi && state->asserted) - __x_writeq(0, __x_trig_page(xd)); - - } - - /* - * This barrier orders the above guest_priority check - * and spin_lock/unlock with clearing in_eoi below. - * - * It also has to be a full mb() as it must ensure - * the MMIOs done in source_eoi() are completed before - * state->in_eoi is visible. - */ - mb(); - state->in_eoi = false; -bail: - - /* Re-evaluate pending IRQs and update HW */ - GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi); - GLUE(X_PFX,push_pending_to_hw)(xc); - pr_devel(" after scan pending=%02x\n", xc->pending); - - /* Apply new CPPR */ - xc->hw_cppr = xc->cppr; - __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); - - return rc; -} - -X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - - pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr); - - xc->GLUE(X_STAT_PFX,h_ipi)++; - - /* Find target */ - vcpu = kvmppc_xive_find_server(vcpu->kvm, server); - if (!vcpu) - return H_PARAMETER; - xc = vcpu->arch.xive_vcpu; - - /* Locklessly write over MFRR */ - xc->mfrr = mfrr; - - /* - * The load of xc->cppr below and the subsequent MMIO store - * to the IPI must happen after the above mfrr update is - * globally visible so that: - * - * - Synchronize with another CPU doing an H_EOI or a H_CPPR - * updating xc->cppr then reading xc->mfrr. - * - * - The target of the IPI sees the xc->mfrr update - */ - mb(); - - /* Shoot the IPI if most favored than target cppr */ - if (mfrr < xc->cppr) - __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); - - return H_SUCCESS; -} -- cgit v1.2.3 From 2f82ec19757f58549467db568c56e7dfff8af283 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 8 Mar 2022 04:27:33 +1000 Subject: powerpc/64: Bump SIGSTKSZ and MINSIGSTKSZ The sad tale of SIGSTKSZ and MINSIGSTKSZ is documented in glibc.git commit f7c399cff5bd ("PowerPC SIGSTKSZ"), which explains why glibc does not use the kernel defines for these constants. Since then in fact there has been a further expansion of the signal stack frame size on little-endian with linux commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit userspace to 512 bytes"), which has caused it to exceed even the glibc defines. See kernel commit 63dee5df43a3 ("powerpc: Allow 4224 bytes of stack expansion for the signal frame") for more details of the history of the expansion. Increase MINSIGSTKSZ to 8192 which is double the current glibc value and fits the current stack frame with room to grow. SIGSTKSZ is set to 4x the minimum as convention. glibc will have to be updated as well. Reviewed-by: Tulio Magno Quites Machado Filho Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220307182734.289289-1-npiggin@gmail.com --- arch/powerpc/include/uapi/asm/signal.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/uapi/asm/signal.h b/arch/powerpc/include/uapi/asm/signal.h index 37d41d87c45b..a5dfe84f50ab 100644 --- a/arch/powerpc/include/uapi/asm/signal.h +++ b/arch/powerpc/include/uapi/asm/signal.h @@ -62,8 +62,13 @@ typedef struct { #define SA_RESTORER 0x04000000U +#ifdef __powerpc64__ +#define MINSIGSTKSZ 8192 +#define SIGSTKSZ 32768 +#else #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 +#endif #include -- cgit v1.2.3 From 2896b2dff49d0377e4372f470dcddbcb26f2be59 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 8 Mar 2022 04:27:34 +1000 Subject: powerpc/signal: Report minimum signal frame size to userspace via AT_MINSIGSTKSZ Implement the AT_MINSIGSTKSZ AUXV entry, allowing userspace to dynamically size stack allocations in a manner forward-compatible with new processor state saved in the signal frame For now these statically find the maximum signal frame size rather than doing any runtime testing of features to minimise the size. glibc 2.34 will take advantage of this, as will applications that use use _SC_MINSIGSTKSZ and _SC_SIGSTKSZ. Reviewed-by: Tulio Magno Quites Machado Filho Signed-off-by: Nicholas Piggin References: 94b07c1f8c39 ("arm64: signal: Report signal frame size to userspace via auxv") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220307182734.289289-2-npiggin@gmail.com --- arch/powerpc/include/asm/elf.h | 14 +++++++++++++- arch/powerpc/include/asm/signal.h | 5 +++++ arch/powerpc/include/uapi/asm/auxvec.h | 4 +++- arch/powerpc/kernel/signal.c | 15 +++++++++++++++ arch/powerpc/kernel/signal_32.c | 6 ++++++ arch/powerpc/kernel/signal_64.c | 5 +++++ 6 files changed, 47 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index 971589a21bc0..79f1c480b5eb 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -160,7 +160,7 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, * even if DLINFO_ARCH_ITEMS goes to zero or is undefined. * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO \ +#define COMMON_ARCH_DLINFO \ do { \ /* Handle glibc compatibility. */ \ NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC); \ @@ -173,6 +173,18 @@ do { \ ARCH_DLINFO_CACHE_GEOMETRY; \ } while (0) +#define ARCH_DLINFO \ +do { \ + COMMON_ARCH_DLINFO; \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_min_sigframe_size()); \ +} while (0) + +#define COMPAT_ARCH_DLINFO \ +do { \ + COMMON_ARCH_DLINFO; \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_min_sigframe_size_compat()); \ +} while (0) + /* Relocate the kernel image to @final_address */ void relocate(unsigned long final_address); diff --git a/arch/powerpc/include/asm/signal.h b/arch/powerpc/include/asm/signal.h index 99e1c6de27bc..922d43700fb4 100644 --- a/arch/powerpc/include/asm/signal.h +++ b/arch/powerpc/include/asm/signal.h @@ -9,4 +9,9 @@ struct pt_regs; void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); +unsigned long get_min_sigframe_size_32(void); +unsigned long get_min_sigframe_size_64(void); +unsigned long get_min_sigframe_size(void); +unsigned long get_min_sigframe_size_compat(void); + #endif /* _ASM_POWERPC_SIGNAL_H */ diff --git a/arch/powerpc/include/uapi/asm/auxvec.h b/arch/powerpc/include/uapi/asm/auxvec.h index 7af21dc0e320..aa7c16215453 100644 --- a/arch/powerpc/include/uapi/asm/auxvec.h +++ b/arch/powerpc/include/uapi/asm/auxvec.h @@ -48,6 +48,8 @@ #define AT_L3_CACHESIZE 46 #define AT_L3_CACHEGEOMETRY 47 -#define AT_VECTOR_SIZE_ARCH 14 /* entries in ARCH_DLINFO */ +#define AT_MINSIGSTKSZ 51 /* stack needed for signal delivery */ + +#define AT_VECTOR_SIZE_ARCH 15 /* entries in ARCH_DLINFO */ #endif diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index f7f8620663c7..68a91e553e14 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -141,6 +141,21 @@ unsigned long copy_ckvsx_from_user(struct task_struct *task, int show_unhandled_signals = 1; +unsigned long get_min_sigframe_size(void) +{ + if (IS_ENABLED(CONFIG_PPC64)) + return get_min_sigframe_size_64(); + else + return get_min_sigframe_size_32(); +} + +#ifdef CONFIG_COMPAT +unsigned long get_min_sigframe_size_compat(void) +{ + return get_min_sigframe_size_32(); +} +#endif + /* * Allocate space for the signal frame */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index d84c434b2b78..157a7403e3eb 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -233,6 +233,12 @@ struct rt_sigframe { int abigap[56]; }; +unsigned long get_min_sigframe_size_32(void) +{ + return max(sizeof(struct rt_sigframe) + __SIGNAL_FRAMESIZE + 16, + sizeof(struct sigframe) + __SIGNAL_FRAMESIZE); +} + /* * Save the current user registers on the user stack. * We only save the altivec/spe registers if the process has used diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 858fc13b8c51..472596a109e2 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -66,6 +66,11 @@ struct rt_sigframe { char abigap[USER_REDZONE_SIZE]; } __attribute__ ((aligned (16))); +unsigned long get_min_sigframe_size_64(void) +{ + return sizeof(struct rt_sigframe) + __SIGNAL_FRAMESIZE; +} + /* * This computes a quad word aligned pointer inside the vmx_reserve array * element. For historical reasons sigcontext might not be quad word aligned, -- cgit v1.2.3 From 1acbf27e8a5843911d122ad0008e79ec5f7b6382 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:01 +0200 Subject: powerpc/code-patching: Inline is_offset_in_{cond}_branch_range() Test in is_offset_in_branch_range() and is_offset_in_cond_branch_range() are simple tests that are worth inlining. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a05be0ccb7373e6a9789a1988fcd0c810f5f9269.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 29 +++++++++++++++++++++++++++-- arch/powerpc/lib/code-patching.c | 27 --------------------------- 2 files changed, 27 insertions(+), 29 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index bccc3a538b9f..378b70545a32 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -24,8 +24,33 @@ DECLARE_STATIC_KEY_FALSE(init_mem_is_free); -bool is_offset_in_branch_range(long offset); -bool is_offset_in_cond_branch_range(long offset); +/* + * Powerpc branch instruction is : + * + * 0 6 30 31 + * +---------+----------------+---+---+ + * | opcode | LI |AA |LK | + * +---------+----------------+---+---+ + * Where AA = 0 and LK = 0 + * + * LI is a signed 24 bits integer. The real branch offset is computed + * by: imm32 = SignExtend(LI:'0b00', 32); + * + * So the maximum forward branch should be: + * (0x007fffff << 2) = 0x01fffffc = 0x1fffffc + * The maximum backward branch should be: + * (0xff800000 << 2) = 0xfe000000 = -0x2000000 + */ +static inline bool is_offset_in_branch_range(long offset) +{ + return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3)); +} + +static inline bool is_offset_in_cond_branch_range(long offset) +{ + return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3); +} + int create_branch(ppc_inst_t *instr, const u32 *addr, unsigned long target, int flags); int create_cond_branch(ppc_inst_t *instr, const u32 *addr, diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index be670e1abafa..13a5a386d35b 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -214,33 +214,6 @@ int patch_branch(u32 *addr, unsigned long target, int flags) return patch_instruction(addr, instr); } -bool is_offset_in_branch_range(long offset) -{ - /* - * Powerpc branch instruction is : - * - * 0 6 30 31 - * +---------+----------------+---+---+ - * | opcode | LI |AA |LK | - * +---------+----------------+---+---+ - * Where AA = 0 and LK = 0 - * - * LI is a signed 24 bits integer. The real branch offset is computed - * by: imm32 = SignExtend(LI:'0b00', 32); - * - * So the maximum forward branch should be: - * (0x007fffff << 2) = 0x01fffffc = 0x1fffffc - * The maximum backward branch should be: - * (0xff800000 << 2) = 0xfe000000 = -0x2000000 - */ - return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3)); -} - -bool is_offset_in_cond_branch_range(long offset) -{ - return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3); -} - /* * Helper to check if a given instruction is a conditional branch * Derived from the conditional checks in analyse_instr() -- cgit v1.2.3 From d2f47dabf1252520a88d257133e6bdec474fd935 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:03 +0200 Subject: powerpc/code-patching: Inline create_branch() create_branch() is a good candidate for inlining because: - Flags can be folded in. - Range tests are likely to be already done. Hence reducing the create_branch() to only a set of instructions. So inline it. It improves ftrace activation by 10%. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/69851cc9a7bf8f03d025e6d29e165f2d0bd3bb6e.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 22 ++++++++++++++++++++-- arch/powerpc/lib/code-patching.c | 20 -------------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 378b70545a32..947d4ae062f5 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -51,8 +51,26 @@ static inline bool is_offset_in_cond_branch_range(long offset) return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3); } -int create_branch(ppc_inst_t *instr, const u32 *addr, - unsigned long target, int flags); +static inline int create_branch(ppc_inst_t *instr, const u32 *addr, + unsigned long target, int flags) +{ + long offset; + + *instr = ppc_inst(0); + offset = target; + if (! (flags & BRANCH_ABSOLUTE)) + offset = offset - (unsigned long)addr; + + /* Check we can represent the target in the instruction format */ + if (!is_offset_in_branch_range(offset)) + return 1; + + /* Mask out the flags and target, so they don't step on each other. */ + *instr = ppc_inst(0x48000000 | (flags & 0x3) | (offset & 0x03FFFFFC)); + + return 0; +} + int create_cond_branch(ppc_inst_t *instr, const u32 *addr, unsigned long target, int flags); int patch_branch(u32 *addr, unsigned long target, int flags); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 13a5a386d35b..46ffce27135e 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -236,26 +236,6 @@ bool is_conditional_branch(ppc_inst_t instr) } NOKPROBE_SYMBOL(is_conditional_branch); -int create_branch(ppc_inst_t *instr, const u32 *addr, - unsigned long target, int flags) -{ - long offset; - - *instr = ppc_inst(0); - offset = target; - if (! (flags & BRANCH_ABSOLUTE)) - offset = offset - (unsigned long)addr; - - /* Check we can represent the target in the instruction format */ - if (!is_offset_in_branch_range(offset)) - return 1; - - /* Mask out the flags and target, so they don't step on each other. */ - *instr = ppc_inst(0x48000000 | (flags & 0x3) | (offset & 0x03FFFFFC)); - - return 0; -} - int create_cond_branch(ppc_inst_t *instr, const u32 *addr, unsigned long target, int flags) { -- cgit v1.2.3 From 7d40aff8213c92e64a1576ba9dfebcd201c0564d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:07 +0200 Subject: powerpc: Replace PPC64_ELF_ABI_v{1/2} by CONFIG_PPC64_ELF_ABI_V{1/2} Replace all uses of PPC64_ELF_ABI_v1 and PPC64_ELF_ABI_v2 by resp CONFIG_PPC64_ELF_ABI_V1 and CONFIG_PPC64_ELF_ABI_V2. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ba13d59e8c50bc9aa6328f1c7f0c0d0278e0a3a7.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 12 ++++++------ arch/powerpc/include/asm/ftrace.h | 4 ++-- arch/powerpc/include/asm/linkage.h | 2 +- arch/powerpc/include/asm/ppc_asm.h | 4 ++-- arch/powerpc/include/asm/ptrace.h | 2 +- arch/powerpc/kernel/head_64.S | 2 +- arch/powerpc/kernel/interrupt_64.S | 2 +- arch/powerpc/kernel/kprobes.c | 6 +++--- arch/powerpc/kernel/misc_64.S | 2 +- arch/powerpc/kernel/module.c | 4 ++-- arch/powerpc/kernel/module_64.c | 4 ++-- arch/powerpc/kernel/ptrace/ptrace.c | 2 +- arch/powerpc/kernel/trace/ftrace.c | 4 ++-- arch/powerpc/kvm/book3s_interrupts.S | 2 +- arch/powerpc/kvm/book3s_rmhandlers.S | 2 +- arch/powerpc/net/bpf_jit.h | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- 18 files changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 947d4ae062f5..2f73c0900040 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -132,7 +132,7 @@ bool is_conditional_branch(ppc_inst_t instr); static inline unsigned long ppc_function_entry(void *func) { -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 u32 *insn = func; /* @@ -157,7 +157,7 @@ static inline unsigned long ppc_function_entry(void *func) return (unsigned long)(insn + 2); else return (unsigned long)func; -#elif defined(PPC64_ELF_ABI_v1) +#elif defined(CONFIG_PPC64_ELF_ABI_V1) /* * On PPC64 ABIv1 the function pointer actually points to the * function's descriptor. The first entry in the descriptor is the @@ -171,7 +171,7 @@ static inline unsigned long ppc_function_entry(void *func) static inline unsigned long ppc_global_function_entry(void *func) { -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 /* PPC64 ABIv2 the global entry point is at the address */ return (unsigned long)func; #else @@ -188,7 +188,7 @@ static inline unsigned long ppc_global_function_entry(void *func) static inline unsigned long ppc_kallsyms_lookup_name(const char *name) { unsigned long addr; -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 /* check for dot variant */ char dot_name[1 + KSYM_NAME_LEN]; bool dot_appended = false; @@ -209,7 +209,7 @@ static inline unsigned long ppc_kallsyms_lookup_name(const char *name) if (!addr && dot_appended) /* Let's try the original non-dot symbol lookup */ addr = kallsyms_lookup_name(name); -#elif defined(PPC64_ELF_ABI_v2) +#elif defined(CONFIG_PPC64_ELF_ABI_V2) addr = kallsyms_lookup_name(name); if (addr) addr = ppc_function_entry((void *)addr); @@ -226,7 +226,7 @@ static inline unsigned long ppc_kallsyms_lookup_name(const char *name) */ /* This must match the definition of STK_GOT in */ -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 #define R2_STACK_OFFSET 24 #else #define R2_STACK_OFFSET 40 diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index d83758acd1c7..b56166b7ea68 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -64,7 +64,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, * those. */ #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { /* We need to skip past the initial dot, and the __se_sys alias */ @@ -83,7 +83,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name (!strncmp(sym, "ppc32_", 6) && !strcmp(sym + 6, name + 4)) || (!strncmp(sym, "ppc64_", 6) && !strcmp(sym + 6, name + 4)); } -#endif /* PPC64_ELF_ABI_v1 */ +#endif /* CONFIG_PPC64_ELF_ABI_V1 */ #endif /* CONFIG_FTRACE_SYSCALLS */ #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/include/asm/linkage.h b/arch/powerpc/include/asm/linkage.h index 1f00d2891d69..b71b9582e754 100644 --- a/arch/powerpc/include/asm/linkage.h +++ b/arch/powerpc/include/asm/linkage.h @@ -4,7 +4,7 @@ #include -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 #define cond_syscall(x) \ asm ("\t.weak " #x "\n\t.set " #x ", sys_ni_syscall\n" \ "\t.weak ." #x "\n\t.set ." #x ", .sys_ni_syscall\n") diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 4dea2d963738..83c02f5a7f2a 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -149,7 +149,7 @@ #define __STK_REG(i) (112 + ((i)-14)*8) #define STK_REG(i) __STK_REG(__REG_##i) -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 #define STK_GOT 24 #define __STK_PARAM(i) (32 + ((i)-3)*8) #else @@ -158,7 +158,7 @@ #endif #define STK_PARAM(i) __STK_PARAM(__REG_##i) -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 #define _GLOBAL(name) \ .align 2 ; \ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 42f89e2d8f04..a03403695cd4 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -120,7 +120,7 @@ struct pt_regs STACK_FRAME_OVERHEAD + KERNEL_REDZONE_SIZE) #define STACK_FRAME_MARKER 12 -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 #define STACK_FRAME_MIN_SIZE 32 #else #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 5c5181e8d5f1..f85660d054bd 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -435,7 +435,7 @@ generic_secondary_common_init: ld r12,CPU_SPEC_RESTORE(r23) cmpdi 0,r12,0 beq 3f -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 ld r12,0(r12) #endif mtctr r12 diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 6471034c7909..ce25b28cf418 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -711,7 +711,7 @@ _GLOBAL(ret_from_kernel_thread) REST_NVGPRS(r1) mtctr r14 mr r3,r15 -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 mr r12,r14 #endif bctrl diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index d5f55e5c8971..1c97c0f177ae 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -45,7 +45,7 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) { kprobe_opcode_t *addr = NULL; -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 /* PPC64 ABIv2 needs local entry point */ addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); if (addr && !offset) { @@ -63,7 +63,7 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) #endif addr = (kprobe_opcode_t *)ppc_function_entry(addr); } -#elif defined(PPC64_ELF_ABI_v1) +#elif defined(CONFIG_PPC64_ELF_ABI_V1) /* * 64bit powerpc ABIv1 uses function descriptors: * - Check for the dot variant of the symbol first. @@ -107,7 +107,7 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) static bool arch_kprobe_on_func_entry(unsigned long offset) { -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 #ifdef CONFIG_KPROBES_ON_FTRACE return offset <= 16; #else diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index d38a019b38e1..fd6d8d3a548e 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -454,7 +454,7 @@ _GLOBAL(kexec_sequence) beq 1f /* clear out hardware hash page table and tlb */ -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 ld r12,0(r27) /* deref function descriptor */ #else mr r12,r27 diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 97a76a8619fb..f6d6ae0a1692 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -64,13 +64,13 @@ int module_finalize(const Elf_Ehdr *hdr, (void *)sect->sh_addr + sect->sh_size); #endif /* CONFIG_PPC64 */ -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 sect = find_section(hdr, sechdrs, ".opd"); if (sect != NULL) { me->arch.start_opd = sect->sh_addr; me->arch.end_opd = sect->sh_addr + sect->sh_size; } -#endif /* PPC64_ELF_ABI_v1 */ +#endif /* CONFIG_PPC64_ELF_ABI_V1 */ #ifdef CONFIG_PPC_BARRIER_NOSPEC sect = find_section(hdr, sechdrs, "__spec_barrier_fixup"); diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 2cce576edbc5..8542ad024400 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -31,7 +31,7 @@ this, and makes other things simpler. Anton? --RR. */ -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 static func_desc_t func_desc(unsigned long addr) { @@ -122,7 +122,7 @@ static u32 ppc64_stub_insns[] = { /* Save current r2 value in magic place on the stack. */ PPC_RAW_STD(_R2, _R1, R2_STACK_OFFSET), PPC_RAW_LD(_R12, _R11, 32), -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 /* Set up new r2 from function descriptor */ PPC_RAW_LD(_R2, _R11, 40), #endif diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 6d5026a9db4f..9fbe155a9bd0 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -445,7 +445,7 @@ void __init pt_regs_check(void) */ BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long)); -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 BUILD_BUG_ON(!IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS)); #else BUILD_BUG_ON(IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS)); diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 1b05d33f96c6..0b199fc9cfd3 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -953,7 +953,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, #endif #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 char *arch_ftrace_match_adjust(char *str, const char *search) { if (str[0] == '.' && search[0] != '.') @@ -961,4 +961,4 @@ char *arch_ftrace_match_adjust(char *str, const char *search) else return str; } -#endif /* PPC64_ELF_ABI_v1 */ +#endif /* CONFIG_PPC64_ELF_ABI_V1 */ diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 25a3679fb590..f4bec2fc51aa 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -15,7 +15,7 @@ #include #if defined(CONFIG_PPC_BOOK3S_64) -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 #define FUNC(name) name #else #define FUNC(name) GLUE(.,name) diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index b45b750fa77a..03886ca24498 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -26,7 +26,7 @@ #if defined(CONFIG_PPC_BOOK3S_64) -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 #define FUNC(name) name #else #define FUNC(name) GLUE(.,name) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 979701d360da..80d973da9093 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -13,7 +13,7 @@ #include #include -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 #define FUNCTION_DESCR_SIZE 24 #else #define FUNCTION_DESCR_SIZE 0 diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 427185256216..43e634126514 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -276,7 +276,7 @@ skip_codegen_passes: */ bpf_jit_dump(flen, proglen, pass, code_base); -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 /* Function descriptor nastiness: Address + TOC */ ((u64 *)image)[0] = (u64)code_base; ((u64 *)image)[1] = local_paca->kernel_toc; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 585f257da045..d7b42f45669e 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -126,7 +126,7 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; - if (__is_defined(PPC64_ELF_ABI_v2)) + if (__is_defined(CONFIG_PPC64_ELF_ABI_V2)) EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); /* @@ -266,7 +266,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o int b2p_index = bpf_to_ppc(BPF_REG_3); int bpf_tailcall_prologue_size = 8; - if (__is_defined(PPC64_ELF_ABI_v2)) + if (__is_defined(CONFIG_PPC64_ELF_ABI_V2)) bpf_tailcall_prologue_size += 4; /* skip past the toc load */ /* -- cgit v1.2.3 From 5b89492c03e5c0a2c259b97d7d4c1bb9b02860aa Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:08 +0200 Subject: powerpc: Finalise cleanup around ABI use Now that we have CONFIG_PPC64_ELF_ABI_V1 and CONFIG_PPC64_ELF_ABI_V2, get rid of all indirect detection of ABI version. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/709d9d69523c14c8a9fba4486395dca0f2d675b1.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 +- arch/powerpc/Makefile | 2 +- arch/powerpc/include/asm/types.h | 8 -------- arch/powerpc/kernel/fadump.c | 13 ++++++++----- arch/powerpc/kernel/ptrace/ptrace.c | 6 ------ arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- 6 files changed, 12 insertions(+), 23 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 145af02df3dc..06b5ef6b089b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -208,7 +208,7 @@ config PPC select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FUNCTION_DESCRIPTORS if PPC64 && !CPU_LITTLE_ENDIAN + select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 1ba98be84101..8bd3b631f094 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -213,7 +213,7 @@ CHECKFLAGS += -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__ ifdef CONFIG_CPU_BIG_ENDIAN CHECKFLAGS += -D__BIG_ENDIAN__ else -CHECKFLAGS += -D__LITTLE_ENDIAN__ -D_CALL_ELF=2 +CHECKFLAGS += -D__LITTLE_ENDIAN__ endif ifdef CONFIG_476FPE_ERR46 diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index 84078c28c1a2..93157a661dcc 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -11,14 +11,6 @@ #include -#ifdef __powerpc64__ -#if defined(_CALL_ELF) && _CALL_ELF == 2 -#define PPC64_ELF_ABI_v2 1 -#else -#define PPC64_ELF_ABI_v1 1 -#endif -#endif /* __powerpc64__ */ - #ifndef __ASSEMBLY__ typedef __vector128 vector128; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index bfb671fe904b..a171d0f6621d 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -978,11 +978,14 @@ static int fadump_init_elfcore_header(char *bufp) elf->e_entry = 0; elf->e_phoff = sizeof(struct elfhdr); elf->e_shoff = 0; -#if defined(_CALL_ELF) - elf->e_flags = _CALL_ELF; -#else - elf->e_flags = 0; -#endif + + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) + elf->e_flags = 2; + else if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1)) + elf->e_flags = 1; + else + elf->e_flags = 0; + elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize = sizeof(struct elf_phdr); elf->e_phnum = 0; diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 9fbe155a9bd0..4d2dc22d4a2d 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -444,10 +444,4 @@ void __init pt_regs_check(void) * real registers. */ BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long)); - -#ifdef CONFIG_PPC64_ELF_ABI_V1 - BUILD_BUG_ON(!IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS)); -#else - BUILD_BUG_ON(IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS)); -#endif } diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index d7b42f45669e..594c54931e20 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -126,7 +126,7 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; - if (__is_defined(CONFIG_PPC64_ELF_ABI_V2)) + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); /* @@ -266,7 +266,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o int b2p_index = bpf_to_ppc(BPF_REG_3); int bpf_tailcall_prologue_size = 8; - if (__is_defined(CONFIG_PPC64_ELF_ABI_V2)) + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) bpf_tailcall_prologue_size += 4; /* skip past the toc load */ /* -- cgit v1.2.3 From e89aa642be21b14e53bab40a37b8c6b0cf05143d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:14 +0200 Subject: powerpc/ftrace: Use PPC_RAW_xxx() macros instead of opencoding. PPC_RAW_xxx() macros are self explanatory and less error prone than open coding. Use them in ftrace.c Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9292094c9a69cef6d29ee83f435a557b59c45065.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 5 +++++ arch/powerpc/kernel/trace/ftrace.c | 32 +++++++++++--------------------- 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 683e9bc618a7..de6534d561f7 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -363,6 +363,10 @@ #define PPC_HIGHER(v) (((v) >> 32) & 0xffff) #define PPC_HIGHEST(v) (((v) >> 48) & 0xffff) +/* LI Field */ +#define PPC_LI_MASK 0x03fffffc +#define PPC_LI(v) ((v) & PPC_LI_MASK) + /* * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a * larx with EH set as an illegal instruction. @@ -583,6 +587,7 @@ #define PPC_RAW_EIEIO() (0x7c0006ac) #define PPC_RAW_BRANCH(addr) (PPC_INST_BRANCH | ((addr) & 0x03fffffc)) +#define PPC_RAW_BL(offset) (0x48000001 | PPC_LI(offset)) /* Deal with instructions that older assemblers aren't aware of */ #define PPC_BCCTR_FLUSH stringify_in_c(.long PPC_INST_BCCTR_FLUSH) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index c4a68340a351..ac3f97dd1729 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -90,19 +90,19 @@ static int test_24bit_addr(unsigned long ip, unsigned long addr) static int is_bl_op(ppc_inst_t op) { - return (ppc_inst_val(op) & 0xfc000003) == 0x48000001; + return (ppc_inst_val(op) & ~PPC_LI_MASK) == PPC_RAW_BL(0); } static int is_b_op(ppc_inst_t op) { - return (ppc_inst_val(op) & 0xfc000003) == 0x48000000; + return (ppc_inst_val(op) & ~PPC_LI_MASK) == PPC_RAW_BRANCH(0); } static unsigned long find_bl_target(unsigned long ip, ppc_inst_t op) { int offset; - offset = (ppc_inst_val(op) & 0x03fffffc); + offset = PPC_LI(ppc_inst_val(op)); /* make it signed */ if (offset & 0x02000000) offset |= 0xfe000000; @@ -182,7 +182,7 @@ __ftrace_make_nop(struct module *mod, * Use a b +8 to jump over the load. */ - pop = ppc_inst(PPC_INST_BRANCH | 8); /* b +8 */ + pop = ppc_inst(PPC_RAW_BRANCH(8)); /* b +8 */ /* * Check what is in the next instruction. We can see ld r2,40(r1), but @@ -394,17 +394,8 @@ int ftrace_make_nop(struct module *mod, static int expected_nop_sequence(void *ip, ppc_inst_t op0, ppc_inst_t op1) { - /* - * We expect to see: - * - * b +8 - * ld r2,XX(r1) - * - * The load offset is different depending on the ABI. For simplicity - * just mask it out when doing the compare. - */ - if (!ppc_inst_equal(op0, ppc_inst(0x48000008)) || - (ppc_inst_val(op1) & 0xffff0000) != 0xe8410000) + if (!ppc_inst_equal(op0, ppc_inst(PPC_RAW_BRANCH(8))) || + !ppc_inst_equal(op1, ppc_inst(PPC_INST_LD_TOC))) return 0; return 1; } @@ -412,7 +403,6 @@ expected_nop_sequence(void *ip, ppc_inst_t op0, ppc_inst_t op1) static int expected_nop_sequence(void *ip, ppc_inst_t op0, ppc_inst_t op1) { - /* look for patched "NOP" on ppc64 with -mprofile-kernel or ppc32 */ if (!ppc_inst_equal(op0, ppc_inst(PPC_RAW_NOP()))) return 0; return 1; @@ -738,11 +728,11 @@ int __init ftrace_dyn_arch_init(void) int i; unsigned int *tramp[] = { ftrace_tramp_text, ftrace_tramp_init }; u32 stub_insns[] = { - 0xe98d0000 | PACATOC, /* ld r12,PACATOC(r13) */ - 0x3d8c0000, /* addis r12,r12, */ - 0x398c0000, /* addi r12,r12, */ - 0x7d8903a6, /* mtctr r12 */ - 0x4e800420, /* bctr */ + PPC_RAW_LD(_R12, _R13, PACATOC), + PPC_RAW_ADDIS(_R12, _R12, 0), + PPC_RAW_ADDI(_R12, _R12, 0), + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR() }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); -- cgit v1.2.3 From 0b46ac44f2673be2ee51bb52149cab3546ff1696 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 9 May 2022 12:34:44 +0300 Subject: termbits.h: create termbits-common.h for identical bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some defines are the same across all archs. Move the most obvious intersection to termbits-common.h. Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220509093446.6677-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/uapi/asm/termbits.h | 52 +----------------------- arch/mips/include/uapi/asm/termbits.h | 53 +----------------------- arch/parisc/include/uapi/asm/termbits.h | 54 +------------------------ arch/powerpc/include/uapi/asm/termbits.h | 52 +----------------------- arch/sparc/include/uapi/asm/termbits.h | 53 +----------------------- include/uapi/asm-generic/termbits-common.h | 65 ++++++++++++++++++++++++++++++ include/uapi/asm-generic/termbits.h | 53 +----------------------- 7 files changed, 76 insertions(+), 306 deletions(-) create mode 100644 include/uapi/asm-generic/termbits-common.h (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/uapi/asm/termbits.h b/arch/alpha/include/uapi/asm/termbits.h index 30dc7ff777b8..3c7a9afc4333 100644 --- a/arch/alpha/include/uapi/asm/termbits.h +++ b/arch/alpha/include/uapi/asm/termbits.h @@ -4,8 +4,8 @@ #include -typedef unsigned char cc_t; -typedef unsigned int speed_t; +#include + typedef unsigned int tcflag_t; /* @@ -72,33 +72,17 @@ struct ktermios { #define VTIME 17 /* c_iflag bits */ -#define IGNBRK 0x00001 -#define BRKINT 0x00002 -#define IGNPAR 0x00004 -#define PARMRK 0x00008 -#define INPCK 0x00010 -#define ISTRIP 0x00020 -#define INLCR 0x00040 -#define IGNCR 0x00080 -#define ICRNL 0x00100 #define IXON 0x00200 #define IXOFF 0x00400 -#define IXANY 0x00800 #define IUCLC 0x01000 #define IMAXBEL 0x02000 #define IUTF8 0x04000 /* c_oflag bits */ -#define OPOST 0x00001 #define ONLCR 0x00002 #define OLCUC 0x00004 -#define OCRNL 0x00008 -#define ONOCR 0x00010 -#define ONLRET 0x00020 -#define OFILL 0x000040 -#define OFDEL 0x000080 #define NLDLY 0x000300 #define NL0 0x000000 #define NL1 0x000100 @@ -131,24 +115,6 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x0000001f -#define B0 0x00000000 /* hang up */ -#define B50 0x00000001 -#define B75 0x00000002 -#define B110 0x00000003 -#define B134 0x00000004 -#define B150 0x00000005 -#define B200 0x00000006 -#define B300 0x00000007 -#define B600 0x00000008 -#define B1200 0x00000009 -#define B1800 0x0000000a -#define B2400 0x0000000b -#define B4800 0x0000000c -#define B9600 0x0000000d -#define B19200 0x0000000e -#define B38400 0x0000000f -#define EXTA B19200 -#define EXTB B38400 #define CBAUDEX 0x00000000 #define B57600 0x00000010 #define B115200 0x00000011 @@ -180,11 +146,8 @@ struct ktermios { #define HUPCL 0x00004000 #define CLOCAL 0x00008000 -#define CMSPAR 0x40000000 /* mark or space (stick) parity */ -#define CRTSCTS 0x80000000 /* flow control */ #define CIBAUD 0x1f0000 -#define IBSHIFT 16 /* c_lflag bits */ #define ISIG 0x00000080 @@ -204,17 +167,6 @@ struct ktermios { #define IEXTEN 0x00000400 #define EXTPROC 0x10000000 -/* Values for the ACTION argument to `tcflow'. */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* Values for the QUEUE_SELECTOR argument to `tcflush'. */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - /* Values for the OPTIONAL_ACTIONS argument to `tcsetattr'. */ #define TCSANOW 0 #define TCSADRAIN 1 diff --git a/arch/mips/include/uapi/asm/termbits.h b/arch/mips/include/uapi/asm/termbits.h index d1315ccdb39a..b33f514315ce 100644 --- a/arch/mips/include/uapi/asm/termbits.h +++ b/arch/mips/include/uapi/asm/termbits.h @@ -13,8 +13,8 @@ #include -typedef unsigned char cc_t; -typedef unsigned int speed_t; +#include + typedef unsigned int tcflag_t; /* @@ -80,31 +80,15 @@ struct ktermios { #define VEOL 17 /* End-of-line character [ICANON]. */ /* c_iflag bits */ -#define IGNBRK 0x00001 /* Ignore break condition. */ -#define BRKINT 0x00002 /* Signal interrupt on break. */ -#define IGNPAR 0x00004 /* Ignore characters with parity errors. */ -#define PARMRK 0x00008 /* Mark parity and framing errors. */ -#define INPCK 0x00010 /* Enable input parity check. */ -#define ISTRIP 0x00020 /* Strip 8th bit off characters. */ -#define INLCR 0x00040 /* Map NL to CR on input. */ -#define IGNCR 0x00080 /* Ignore CR. */ -#define ICRNL 0x00100 /* Map CR to NL on input. */ #define IUCLC 0x00200 /* Map upper case to lower case on input. */ #define IXON 0x00400 /* Enable start/stop output control. */ -#define IXANY 0x00800 /* Any character will restart after stop. */ #define IXOFF 0x01000 /* Enable start/stop input control. */ #define IMAXBEL 0x02000 /* Ring bell when input queue is full. */ #define IUTF8 0x04000 /* Input is UTF-8 */ /* c_oflag bits */ -#define OPOST 0x00001 /* Perform output processing. */ #define OLCUC 0x00002 /* Map lower case to upper case on output. */ #define ONLCR 0x00004 /* Map NL to CR-NL on output. */ -#define OCRNL 0x00008 -#define ONOCR 0x00010 -#define ONLRET 0x00020 -#define OFILL 0x00040 -#define OFDEL 0x00080 #define NLDLY 0x00100 #define NL0 0x00000 #define NL1 0x00100 @@ -135,24 +119,6 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x0000100f -#define B0 0x00000000 /* hang up */ -#define B50 0x00000001 -#define B75 0x00000002 -#define B110 0x00000003 -#define B134 0x00000004 -#define B150 0x00000005 -#define B200 0x00000006 -#define B300 0x00000007 -#define B600 0x00000008 -#define B1200 0x00000009 -#define B1800 0x0000000a -#define B2400 0x0000000b -#define B4800 0x0000000c -#define B9600 0x0000000d -#define B19200 0x0000000e -#define B38400 0x0000000f -#define EXTA B19200 -#define EXTB B38400 #define CSIZE 0x00000030 /* Number of bits per byte (mask) */ #define CS5 0x00000000 /* 5 bits per byte */ #define CS6 0x00000010 /* 6 bits per byte */ @@ -182,10 +148,6 @@ struct ktermios { #define B3500000 0x0000100e #define B4000000 0x0000100f #define CIBAUD 0x100f0000 /* input baud rate */ -#define CMSPAR 0x40000000 /* mark or space (stick) parity */ -#define CRTSCTS 0x80000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ /* c_lflag bits */ #define ISIG 0x00001 /* Enable signals. */ @@ -209,17 +171,6 @@ struct ktermios { /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ #define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 /* Suspend output. */ -#define TCOON 1 /* Restart suspended output. */ -#define TCIOFF 2 /* Send a STOP character. */ -#define TCION 3 /* Send a START character. */ - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 /* Discard data received but not yet read. */ -#define TCOFLUSH 1 /* Discard data written but not yet sent. */ -#define TCIOFLUSH 2 /* Discard all pending data. */ - /* tcsetattr uses these */ #define TCSANOW TCSETS /* Change immediately. */ #define TCSADRAIN TCSETSW /* Change when pending output is written. */ diff --git a/arch/parisc/include/uapi/asm/termbits.h b/arch/parisc/include/uapi/asm/termbits.h index 6017ee08f099..7f74a822b7ea 100644 --- a/arch/parisc/include/uapi/asm/termbits.h +++ b/arch/parisc/include/uapi/asm/termbits.h @@ -4,8 +4,8 @@ #include -typedef unsigned char cc_t; -typedef unsigned int speed_t; +#include + typedef unsigned int tcflag_t; #define NCCS 19 @@ -61,31 +61,15 @@ struct ktermios { /* c_iflag bits */ -#define IGNBRK 0x00001 -#define BRKINT 0x00002 -#define IGNPAR 0x00004 -#define PARMRK 0x00008 -#define INPCK 0x00010 -#define ISTRIP 0x00020 -#define INLCR 0x00040 -#define IGNCR 0x00080 -#define ICRNL 0x00100 #define IUCLC 0x00200 #define IXON 0x00400 -#define IXANY 0x00800 #define IXOFF 0x01000 #define IMAXBEL 0x04000 #define IUTF8 0x08000 /* c_oflag bits */ -#define OPOST 0x00001 #define OLCUC 0x00002 #define ONLCR 0x00004 -#define OCRNL 0x00008 -#define ONOCR 0x00010 -#define ONLRET 0x00020 -#define OFILL 0x00040 -#define OFDEL 0x00080 #define NLDLY 0x00100 #define NL0 0x00000 #define NL1 0x00100 @@ -112,24 +96,6 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x0000100f -#define B0 0x00000000 /* hang up */ -#define B50 0x00000001 -#define B75 0x00000002 -#define B110 0x00000003 -#define B134 0x00000004 -#define B150 0x00000005 -#define B200 0x00000006 -#define B300 0x00000007 -#define B600 0x00000008 -#define B1200 0x00000009 -#define B1800 0x0000000a -#define B2400 0x0000000b -#define B4800 0x0000000c -#define B9600 0x0000000d -#define B19200 0x0000000e -#define B38400 0x0000000f -#define EXTA B19200 -#define EXTB B38400 #define CSIZE 0x00000030 #define CS5 0x00000000 #define CS6 0x00000010 @@ -159,11 +125,6 @@ struct ktermios { #define B3500000 0x0000100e #define B4000000 0x0000100f #define CIBAUD 0x100f0000 /* input baud rate */ -#define CMSPAR 0x40000000 /* mark or space (stick) parity */ -#define CRTSCTS 0x80000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ - /* c_lflag bits */ #define ISIG 0x00001 @@ -183,17 +144,6 @@ struct ktermios { #define IEXTEN 0x08000 #define EXTPROC 0x10000 -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - /* tcsetattr uses these */ #define TCSANOW 0 #define TCSADRAIN 1 diff --git a/arch/powerpc/include/uapi/asm/termbits.h b/arch/powerpc/include/uapi/asm/termbits.h index e4892f2d5592..19d1350a80bb 100644 --- a/arch/powerpc/include/uapi/asm/termbits.h +++ b/arch/powerpc/include/uapi/asm/termbits.h @@ -9,8 +9,8 @@ * 2 of the License, or (at your option) any later version. */ -typedef unsigned char cc_t; -typedef unsigned int speed_t; +#include + typedef unsigned int tcflag_t; /* @@ -64,33 +64,17 @@ struct ktermios { #define VDISCARD 16 /* c_iflag bits */ -#define IGNBRK 0x00001 -#define BRKINT 0x00002 -#define IGNPAR 0x00004 -#define PARMRK 0x00008 -#define INPCK 0x00010 -#define ISTRIP 0x00020 -#define INLCR 0x00040 -#define IGNCR 0x00080 -#define ICRNL 0x00100 #define IXON 0x00200 #define IXOFF 0x00400 -#define IXANY 0x00800 #define IUCLC 0x01000 #define IMAXBEL 0x02000 #define IUTF8 0x04000 /* c_oflag bits */ -#define OPOST 0x00001 #define ONLCR 0x00002 #define OLCUC 0x00004 -#define OCRNL 0x00008 -#define ONOCR 0x00010 -#define ONLRET 0x00020 -#define OFILL 0x000040 -#define OFDEL 0x000080 #define NLDLY 0x000300 #define NL0 0x000000 #define NL1 0x000100 @@ -119,24 +103,6 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x000000ff -#define B0 0x00000000 /* hang up */ -#define B50 0x00000001 -#define B75 0x00000002 -#define B110 0x00000003 -#define B134 0x00000004 -#define B150 0x00000005 -#define B200 0x00000006 -#define B300 0x00000007 -#define B600 0x00000008 -#define B1200 0x00000009 -#define B1800 0x0000000a -#define B2400 0x0000000b -#define B4800 0x0000000c -#define B9600 0x0000000d -#define B19200 0x0000000e -#define B38400 0x0000000f -#define EXTA B19200 -#define EXTB B38400 #define CBAUDEX 0x00000000 #define B57600 0x00000010 #define B115200 0x00000011 @@ -156,7 +122,6 @@ struct ktermios { #define BOTHER 0x0000001f #define CIBAUD 0x00ff0000 -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ #define CSIZE 0x00000300 #define CS5 0x00000000 @@ -171,8 +136,6 @@ struct ktermios { #define HUPCL 0x00004000 #define CLOCAL 0x00008000 -#define CMSPAR 0x40000000 /* mark or space (stick) parity */ -#define CRTSCTS 0x80000000 /* flow control */ /* c_lflag bits */ #define ISIG 0x00000080 @@ -192,17 +155,6 @@ struct ktermios { #define IEXTEN 0x00000400 #define EXTPROC 0x10000000 -/* Values for the ACTION argument to `tcflow'. */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* Values for the QUEUE_SELECTOR argument to `tcflush'. */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - /* Values for the OPTIONAL_ACTIONS argument to `tcsetattr'. */ #define TCSANOW 0 #define TCSADRAIN 1 diff --git a/arch/sparc/include/uapi/asm/termbits.h b/arch/sparc/include/uapi/asm/termbits.h index ce5ad5d0f105..854bdd153cca 100644 --- a/arch/sparc/include/uapi/asm/termbits.h +++ b/arch/sparc/include/uapi/asm/termbits.h @@ -4,8 +4,7 @@ #include -typedef unsigned char cc_t; -typedef unsigned int speed_t; +#include #if defined(__sparc__) && defined(__arch64__) typedef unsigned int tcflag_t; @@ -90,31 +89,15 @@ struct ktermios { #endif /* c_iflag bits */ -#define IGNBRK 0x00000001 -#define BRKINT 0x00000002 -#define IGNPAR 0x00000004 -#define PARMRK 0x00000008 -#define INPCK 0x00000010 -#define ISTRIP 0x00000020 -#define INLCR 0x00000040 -#define IGNCR 0x00000080 -#define ICRNL 0x00000100 #define IUCLC 0x00000200 #define IXON 0x00000400 -#define IXANY 0x00000800 #define IXOFF 0x00001000 #define IMAXBEL 0x00002000 #define IUTF8 0x00004000 /* c_oflag bits */ -#define OPOST 0x00000001 #define OLCUC 0x00000002 #define ONLCR 0x00000004 -#define OCRNL 0x00000008 -#define ONOCR 0x00000010 -#define ONLRET 0x00000020 -#define OFILL 0x00000040 -#define OFDEL 0x00000080 #define NLDLY 0x00000100 #define NL0 0x00000000 #define NL1 0x00000100 @@ -143,24 +126,6 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x0000100f -#define B0 0x00000000 /* hang up */ -#define B50 0x00000001 -#define B75 0x00000002 -#define B110 0x00000003 -#define B134 0x00000004 -#define B150 0x00000005 -#define B200 0x00000006 -#define B300 0x00000007 -#define B600 0x00000008 -#define B1200 0x00000009 -#define B1800 0x0000000a -#define B2400 0x0000000b -#define B4800 0x0000000c -#define B9600 0x0000000d -#define B19200 0x0000000e -#define B38400 0x0000000f -#define EXTA B19200 -#define EXTB B38400 #define CSIZE 0x00000030 #define CS5 0x00000000 #define CS6 0x00000010 @@ -201,10 +166,6 @@ struct ktermios { #define B3500000 0x00001012 #define B4000000 0x00001013 */ #define CIBAUD 0x100f0000 /* input baud rate (not used) */ -#define CMSPAR 0x40000000 /* mark or space (stick) parity */ -#define CRTSCTS 0x80000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ /* c_lflag bits */ #define ISIG 0x00000001 @@ -244,18 +205,6 @@ struct ktermios { /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ #define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ - -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - /* tcsetattr uses these */ #define TCSANOW 0 #define TCSADRAIN 1 diff --git a/include/uapi/asm-generic/termbits-common.h b/include/uapi/asm-generic/termbits-common.h new file mode 100644 index 000000000000..4d084fe8def5 --- /dev/null +++ b/include/uapi/asm-generic/termbits-common.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_GENERIC_TERMBITS_COMMON_H +#define __ASM_GENERIC_TERMBITS_COMMON_H + +typedef unsigned char cc_t; +typedef unsigned int speed_t; + +/* c_iflag bits */ +#define IGNBRK 0x001 /* Ignore break condition */ +#define BRKINT 0x002 /* Signal interrupt on break */ +#define IGNPAR 0x004 /* Ignore characters with parity errors */ +#define PARMRK 0x008 /* Mark parity and framing errors */ +#define INPCK 0x010 /* Enable input parity check */ +#define ISTRIP 0x020 /* Strip 8th bit off characters */ +#define INLCR 0x040 /* Map NL to CR on input */ +#define IGNCR 0x080 /* Ignore CR */ +#define ICRNL 0x100 /* Map CR to NL on input */ +#define IXANY 0x800 /* Any character will restart after stop */ + +/* c_oflag bits */ +#define OPOST 0x01 /* Perform output processing */ +#define OCRNL 0x08 +#define ONOCR 0x10 +#define ONLRET 0x20 +#define OFILL 0x40 +#define OFDEL 0x80 + +/* c_cflag bit meaning */ +/* Common CBAUD rates */ +#define B0 0x00000000 /* hang up */ +#define B50 0x00000001 +#define B75 0x00000002 +#define B110 0x00000003 +#define B134 0x00000004 +#define B150 0x00000005 +#define B200 0x00000006 +#define B300 0x00000007 +#define B600 0x00000008 +#define B1200 0x00000009 +#define B1800 0x0000000a +#define B2400 0x0000000b +#define B4800 0x0000000c +#define B9600 0x0000000d +#define B19200 0x0000000e +#define B38400 0x0000000f +#define EXTA B19200 +#define EXTB B38400 + +#define CMSPAR 0x40000000 /* mark or space (stick) parity */ +#define CRTSCTS 0x80000000 /* flow control */ + +#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ + +/* tcflow() ACTION argument and TCXONC use these */ +#define TCOOFF 0 /* Suspend output */ +#define TCOON 1 /* Restart suspended output */ +#define TCIOFF 2 /* Send a STOP character */ +#define TCION 3 /* Send a START character */ + +/* tcflush() QUEUE_SELECTOR argument and TCFLSH use these */ +#define TCIFLUSH 0 /* Discard data received but not yet read */ +#define TCOFLUSH 1 /* Discard data written but not yet sent */ +#define TCIOFLUSH 2 /* Discard all pending data */ + +#endif /* __ASM_GENERIC_TERMBITS_COMMON_H */ diff --git a/include/uapi/asm-generic/termbits.h b/include/uapi/asm-generic/termbits.h index 470fd673ff84..78a85c326eef 100644 --- a/include/uapi/asm-generic/termbits.h +++ b/include/uapi/asm-generic/termbits.h @@ -4,8 +4,8 @@ #include -typedef unsigned char cc_t; -typedef unsigned int speed_t; +#include + typedef unsigned int tcflag_t; #define NCCS 19 @@ -60,31 +60,15 @@ struct ktermios { #define VEOL2 16 /* c_iflag bits */ -#define IGNBRK 0x00001 -#define BRKINT 0x00002 -#define IGNPAR 0x00004 -#define PARMRK 0x00008 -#define INPCK 0x00010 -#define ISTRIP 0x00020 -#define INLCR 0x00040 -#define IGNCR 0x00080 -#define ICRNL 0x00100 #define IUCLC 0x00200 #define IXON 0x00400 -#define IXANY 0x00800 #define IXOFF 0x01000 #define IMAXBEL 0x02000 #define IUTF8 0x04000 /* c_oflag bits */ -#define OPOST 0x00001 #define OLCUC 0x00002 #define ONLCR 0x00004 -#define OCRNL 0x00008 -#define ONOCR 0x00010 -#define ONLRET 0x00020 -#define OFILL 0x00040 -#define OFDEL 0x00080 #define NLDLY 0x00100 #define NL0 0x00000 #define NL1 0x00100 @@ -111,24 +95,6 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x0000100f -#define B0 0x00000000 /* hang up */ -#define B50 0x00000001 -#define B75 0x00000002 -#define B110 0x00000003 -#define B134 0x00000004 -#define B150 0x00000005 -#define B200 0x00000006 -#define B300 0x00000007 -#define B600 0x00000008 -#define B1200 0x00000009 -#define B1800 0x0000000a -#define B2400 0x0000000b -#define B4800 0x0000000c -#define B9600 0x0000000d -#define B19200 0x0000000e -#define B38400 0x0000000f -#define EXTA B19200 -#define EXTB B38400 #define CSIZE 0x00000030 #define CS5 0x00000000 #define CS6 0x00000010 @@ -158,10 +124,6 @@ struct ktermios { #define B3500000 0x0000100e #define B4000000 0x0000100f #define CIBAUD 0x100f0000 /* input baud rate */ -#define CMSPAR 0x40000000 /* mark or space (stick) parity */ -#define CRTSCTS 0x80000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ /* c_lflag bits */ #define ISIG 0x00001 @@ -181,17 +143,6 @@ struct ktermios { #define IEXTEN 0x08000 #define EXTPROC 0x10000 -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - /* tcsetattr uses these */ #define TCSANOW 0 #define TCSADRAIN 1 -- cgit v1.2.3 From c9b34088e80efe30459517fa3834cf78532c9a02 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 9 May 2022 12:34:45 +0300 Subject: termbits.h: Align lines & format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Align c_cc defines. - Remove extra newlines. - Realign & adjust number of leading zeros. - Reorder c_cflag defines to ascending order - Make comment ending shorted (=remove period and one extra space from the comments in mips). Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220509093446.6677-3-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/uapi/asm/termbits.h | 130 ++++++++++++------------ arch/mips/include/uapi/asm/termbits.h | 156 ++++++++++++++-------------- arch/parisc/include/uapi/asm/termbits.h | 77 +++++++------- arch/powerpc/include/uapi/asm/termbits.h | 100 +++++++++--------- arch/sparc/include/uapi/asm/termbits.h | 168 +++++++++++++++---------------- include/uapi/asm-generic/termbits.h | 76 +++++++------- 6 files changed, 346 insertions(+), 361 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/uapi/asm/termbits.h b/arch/alpha/include/uapi/asm/termbits.h index 3c7a9afc4333..735e9ffe2795 100644 --- a/arch/alpha/include/uapi/asm/termbits.h +++ b/arch/alpha/include/uapi/asm/termbits.h @@ -53,60 +53,58 @@ struct ktermios { }; /* c_cc characters */ -#define VEOF 0 -#define VEOL 1 -#define VEOL2 2 -#define VERASE 3 -#define VWERASE 4 -#define VKILL 5 -#define VREPRINT 6 -#define VSWTC 7 -#define VINTR 8 -#define VQUIT 9 -#define VSUSP 10 -#define VSTART 12 -#define VSTOP 13 -#define VLNEXT 14 -#define VDISCARD 15 -#define VMIN 16 -#define VTIME 17 +#define VEOF 0 +#define VEOL 1 +#define VEOL2 2 +#define VERASE 3 +#define VWERASE 4 +#define VKILL 5 +#define VREPRINT 6 +#define VSWTC 7 +#define VINTR 8 +#define VQUIT 9 +#define VSUSP 10 +#define VSTART 12 +#define VSTOP 13 +#define VLNEXT 14 +#define VDISCARD 15 +#define VMIN 16 +#define VTIME 17 /* c_iflag bits */ -#define IXON 0x00200 -#define IXOFF 0x00400 -#define IUCLC 0x01000 -#define IMAXBEL 0x02000 -#define IUTF8 0x04000 +#define IXON 0x0200 +#define IXOFF 0x0400 +#define IUCLC 0x1000 +#define IMAXBEL 0x2000 +#define IUTF8 0x4000 /* c_oflag bits */ #define ONLCR 0x00002 #define OLCUC 0x00004 - - -#define NLDLY 0x000300 -#define NL0 0x000000 -#define NL1 0x000100 -#define NL2 0x000200 -#define NL3 0x000300 -#define TABDLY 0x000c00 -#define TAB0 0x000000 -#define TAB1 0x000400 -#define TAB2 0x000800 -#define TAB3 0x000c00 -#define CRDLY 0x003000 -#define CR0 0x000000 -#define CR1 0x001000 -#define CR2 0x002000 -#define CR3 0x003000 -#define FFDLY 0x004000 -#define FF0 0x000000 -#define FF1 0x004000 -#define BSDLY 0x008000 -#define BS0 0x000000 -#define BS1 0x008000 -#define VTDLY 0x010000 -#define VT0 0x000000 -#define VT1 0x010000 +#define NLDLY 0x00300 +#define NL0 0x00000 +#define NL1 0x00100 +#define NL2 0x00200 +#define NL3 0x00300 +#define TABDLY 0x00c00 +#define TAB0 0x00000 +#define TAB1 0x00400 +#define TAB2 0x00800 +#define TAB3 0x00c00 +#define CRDLY 0x03000 +#define CR0 0x00000 +#define CR1 0x01000 +#define CR2 0x02000 +#define CR3 0x03000 +#define FFDLY 0x04000 +#define FF0 0x00000 +#define FF1 0x04000 +#define BSDLY 0x08000 +#define BS0 0x00000 +#define BS1 0x08000 +#define VTDLY 0x10000 +#define VT0 0x00000 +#define VT1 0x10000 /* * Should be equivalent to TAB3, see description of TAB3 in * POSIX.1-2008, Ch. 11.2.3 "Output Modes" @@ -116,38 +114,34 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x0000001f #define CBAUDEX 0x00000000 -#define B57600 0x00000010 -#define B115200 0x00000011 -#define B230400 0x00000012 -#define B460800 0x00000013 -#define B500000 0x00000014 -#define B576000 0x00000015 -#define B921600 0x00000016 -#define B1000000 0x00000017 -#define B1152000 0x00000018 -#define B1500000 0x00000019 -#define B2000000 0x0000001a -#define B2500000 0x0000001b -#define B3000000 0x0000001c -#define B3500000 0x0000001d -#define B4000000 0x0000001e #define BOTHER 0x0000001f - +#define B57600 0x00000010 +#define B115200 0x00000011 +#define B230400 0x00000012 +#define B460800 0x00000013 +#define B500000 0x00000014 +#define B576000 0x00000015 +#define B921600 0x00000016 +#define B1000000 0x00000017 +#define B1152000 0x00000018 +#define B1500000 0x00000019 +#define B2000000 0x0000001a +#define B2500000 0x0000001b +#define B3000000 0x0000001c +#define B3500000 0x0000001d +#define B4000000 0x0000001e #define CSIZE 0x00000300 #define CS5 0x00000000 #define CS6 0x00000100 #define CS7 0x00000200 #define CS8 0x00000300 - #define CSTOPB 0x00000400 #define CREAD 0x00000800 #define PARENB 0x00001000 #define PARODD 0x00002000 #define HUPCL 0x00004000 - #define CLOCAL 0x00008000 - -#define CIBAUD 0x1f0000 +#define CIBAUD 0x001f0000 /* c_lflag bits */ #define ISIG 0x00000080 diff --git a/arch/mips/include/uapi/asm/termbits.h b/arch/mips/include/uapi/asm/termbits.h index b33f514315ce..8fa3e79d4f94 100644 --- a/arch/mips/include/uapi/asm/termbits.h +++ b/arch/mips/include/uapi/asm/termbits.h @@ -15,7 +15,7 @@ #include -typedef unsigned int tcflag_t; +typedef unsigned int tcflag_t; /* * The ABI says nothing about NCC but seems to use NCCS as @@ -54,64 +54,64 @@ struct ktermios { }; /* c_cc characters */ -#define VINTR 0 /* Interrupt character [ISIG]. */ -#define VQUIT 1 /* Quit character [ISIG]. */ -#define VERASE 2 /* Erase character [ICANON]. */ -#define VKILL 3 /* Kill-line character [ICANON]. */ -#define VMIN 4 /* Minimum number of bytes read at once [!ICANON]. */ -#define VTIME 5 /* Time-out value (tenths of a second) [!ICANON]. */ -#define VEOL2 6 /* Second EOL character [ICANON]. */ +#define VINTR 0 /* Interrupt character [ISIG] */ +#define VQUIT 1 /* Quit character [ISIG] */ +#define VERASE 2 /* Erase character [ICANON] */ +#define VKILL 3 /* Kill-line character [ICANON] */ +#define VMIN 4 /* Minimum number of bytes read at once [!ICANON] */ +#define VTIME 5 /* Time-out value (tenths of a second) [!ICANON] */ +#define VEOL2 6 /* Second EOL character [ICANON] */ #define VSWTC 7 /* ??? */ #define VSWTCH VSWTC -#define VSTART 8 /* Start (X-ON) character [IXON, IXOFF]. */ -#define VSTOP 9 /* Stop (X-OFF) character [IXON, IXOFF]. */ -#define VSUSP 10 /* Suspend character [ISIG]. */ +#define VSTART 8 /* Start (X-ON) character [IXON, IXOFF] */ +#define VSTOP 9 /* Stop (X-OFF) character [IXON, IXOFF] */ +#define VSUSP 10 /* Suspend character [ISIG] */ #if 0 /* * VDSUSP is not supported */ -#define VDSUSP 11 /* Delayed suspend character [ISIG]. */ +#define VDSUSP 11 /* Delayed suspend character [ISIG] */ #endif -#define VREPRINT 12 /* Reprint-line character [ICANON]. */ -#define VDISCARD 13 /* Discard character [IEXTEN]. */ -#define VWERASE 14 /* Word-erase character [ICANON]. */ -#define VLNEXT 15 /* Literal-next character [IEXTEN]. */ -#define VEOF 16 /* End-of-file character [ICANON]. */ -#define VEOL 17 /* End-of-line character [ICANON]. */ +#define VREPRINT 12 /* Reprint-line character [ICANON] */ +#define VDISCARD 13 /* Discard character [IEXTEN] */ +#define VWERASE 14 /* Word-erase character [ICANON] */ +#define VLNEXT 15 /* Literal-next character [IEXTEN] */ +#define VEOF 16 /* End-of-file character [ICANON] */ +#define VEOL 17 /* End-of-line character [ICANON] */ /* c_iflag bits */ -#define IUCLC 0x00200 /* Map upper case to lower case on input. */ -#define IXON 0x00400 /* Enable start/stop output control. */ -#define IXOFF 0x01000 /* Enable start/stop input control. */ -#define IMAXBEL 0x02000 /* Ring bell when input queue is full. */ -#define IUTF8 0x04000 /* Input is UTF-8 */ +#define IUCLC 0x0200 /* Map upper case to lower case on input */ +#define IXON 0x0400 /* Enable start/stop output control */ +#define IXOFF 0x1000 /* Enable start/stop input control */ +#define IMAXBEL 0x2000 /* Ring bell when input queue is full */ +#define IUTF8 0x4000 /* Input is UTF-8 */ /* c_oflag bits */ -#define OLCUC 0x00002 /* Map lower case to upper case on output. */ -#define ONLCR 0x00004 /* Map NL to CR-NL on output. */ +#define OLCUC 0x00002 /* Map lower case to upper case on output */ +#define ONLCR 0x00004 /* Map NL to CR-NL on output */ #define NLDLY 0x00100 -#define NL0 0x00000 -#define NL1 0x00100 +#define NL0 0x00000 +#define NL1 0x00100 #define CRDLY 0x00600 -#define CR0 0x00000 -#define CR1 0x00200 -#define CR2 0x00400 -#define CR3 0x00600 +#define CR0 0x00000 +#define CR1 0x00200 +#define CR2 0x00400 +#define CR3 0x00600 #define TABDLY 0x01800 -#define TAB0 0x00000 -#define TAB1 0x00800 -#define TAB2 0x01000 -#define TAB3 0x01800 -#define XTABS 0x01800 +#define TAB0 0x00000 +#define TAB1 0x00800 +#define TAB2 0x01000 +#define TAB3 0x01800 +#define XTABS 0x01800 #define BSDLY 0x02000 -#define BS0 0x00000 -#define BS1 0x02000 +#define BS0 0x00000 +#define BS1 0x02000 #define VTDLY 0x04000 -#define VT0 0x00000 -#define VT1 0x04000 +#define VT0 0x00000 +#define VT1 0x04000 #define FFDLY 0x08000 -#define FF0 0x00000 -#define FF1 0x08000 +#define FF0 0x00000 +#define FF1 0x08000 /* #define PAGEOUT ??? #define WRAP ??? @@ -120,10 +120,10 @@ struct ktermios { /* c_cflag bit meaning */ #define CBAUD 0x0000100f #define CSIZE 0x00000030 /* Number of bits per byte (mask) */ -#define CS5 0x00000000 /* 5 bits per byte */ -#define CS6 0x00000010 /* 6 bits per byte */ -#define CS7 0x00000020 /* 7 bits per byte */ -#define CS8 0x00000030 /* 8 bits per byte */ +#define CS5 0x00000000 /* 5 bits per byte */ +#define CS6 0x00000010 /* 6 bits per byte */ +#define CS7 0x00000020 /* 7 bits per byte */ +#define CS8 0x00000030 /* 8 bits per byte */ #define CSTOPB 0x00000040 /* Two stop bits instead of one */ #define CREAD 0x00000080 /* Enable receiver */ #define PARENB 0x00000100 /* Parity enable */ @@ -131,40 +131,40 @@ struct ktermios { #define HUPCL 0x00000400 /* Hang up on last close */ #define CLOCAL 0x00000800 /* Ignore modem status lines */ #define CBAUDEX 0x00001000 -#define BOTHER 0x00001000 -#define B57600 0x00001001 -#define B115200 0x00001002 -#define B230400 0x00001003 -#define B460800 0x00001004 -#define B500000 0x00001005 -#define B576000 0x00001006 -#define B921600 0x00001007 -#define B1000000 0x00001008 -#define B1152000 0x00001009 -#define B1500000 0x0000100a -#define B2000000 0x0000100b -#define B2500000 0x0000100c -#define B3000000 0x0000100d -#define B3500000 0x0000100e -#define B4000000 0x0000100f +#define BOTHER 0x00001000 +#define B57600 0x00001001 +#define B115200 0x00001002 +#define B230400 0x00001003 +#define B460800 0x00001004 +#define B500000 0x00001005 +#define B576000 0x00001006 +#define B921600 0x00001007 +#define B1000000 0x00001008 +#define B1152000 0x00001009 +#define B1500000 0x0000100a +#define B2000000 0x0000100b +#define B2500000 0x0000100c +#define B3000000 0x0000100d +#define B3500000 0x0000100e +#define B4000000 0x0000100f #define CIBAUD 0x100f0000 /* input baud rate */ /* c_lflag bits */ -#define ISIG 0x00001 /* Enable signals. */ -#define ICANON 0x00002 /* Do erase and kill processing. */ +#define ISIG 0x00001 /* Enable signals */ +#define ICANON 0x00002 /* Do erase and kill processing */ #define XCASE 0x00004 -#define ECHO 0x00008 /* Enable echo. */ -#define ECHOE 0x00010 /* Visual erase for ERASE. */ -#define ECHOK 0x00020 /* Echo NL after KILL. */ -#define ECHONL 0x00040 /* Echo NL even if ECHO is off. */ -#define NOFLSH 0x00080 /* Disable flush after interrupt. */ -#define IEXTEN 0x00100 /* Enable DISCARD and LNEXT. */ -#define ECHOCTL 0x00200 /* Echo control characters as ^X. */ -#define ECHOPRT 0x00400 /* Hardcopy visual erase. */ -#define ECHOKE 0x00800 /* Visual erase for KILL. */ +#define ECHO 0x00008 /* Enable echo */ +#define ECHOE 0x00010 /* Visual erase for ERASE */ +#define ECHOK 0x00020 /* Echo NL after KILL */ +#define ECHONL 0x00040 /* Echo NL even if ECHO is off */ +#define NOFLSH 0x00080 /* Disable flush after interrupt */ +#define IEXTEN 0x00100 /* Enable DISCARD and LNEXT */ +#define ECHOCTL 0x00200 /* Echo control characters as ^X */ +#define ECHOPRT 0x00400 /* Hardcopy visual erase */ +#define ECHOKE 0x00800 /* Visual erase for KILL */ #define FLUSHO 0x02000 -#define PENDIN 0x04000 /* Retype pending input (state). */ -#define TOSTOP 0x08000 /* Send SIGTTOU for background output. */ +#define PENDIN 0x04000 /* Retype pending input (state) */ +#define TOSTOP 0x08000 /* Send SIGTTOU for background output */ #define ITOSTOP TOSTOP #define EXTPROC 0x10000 /* External processing on pty */ @@ -172,8 +172,8 @@ struct ktermios { #define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ /* tcsetattr uses these */ -#define TCSANOW TCSETS /* Change immediately. */ -#define TCSADRAIN TCSETSW /* Change when pending output is written. */ -#define TCSAFLUSH TCSETSF /* Flush pending input before changing. */ +#define TCSANOW TCSETS /* Change immediately */ +#define TCSADRAIN TCSETSW /* Change when pending output is written */ +#define TCSAFLUSH TCSETSF /* Flush pending input before changing */ #endif /* _ASM_TERMBITS_H */ diff --git a/arch/parisc/include/uapi/asm/termbits.h b/arch/parisc/include/uapi/asm/termbits.h index 7f74a822b7ea..d72c5ebf3a3a 100644 --- a/arch/parisc/include/uapi/asm/termbits.h +++ b/arch/parisc/include/uapi/asm/termbits.h @@ -41,31 +41,30 @@ struct ktermios { }; /* c_cc characters */ -#define VINTR 0 -#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VTIME 5 -#define VMIN 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 -#define VSUSP 10 -#define VEOL 11 -#define VREPRINT 12 -#define VDISCARD 13 -#define VWERASE 14 -#define VLNEXT 15 -#define VEOL2 16 - +#define VINTR 0 +#define VQUIT 1 +#define VERASE 2 +#define VKILL 3 +#define VEOF 4 +#define VTIME 5 +#define VMIN 6 +#define VSWTC 7 +#define VSTART 8 +#define VSTOP 9 +#define VSUSP 10 +#define VEOL 11 +#define VREPRINT 12 +#define VDISCARD 13 +#define VWERASE 14 +#define VLNEXT 15 +#define VEOL2 16 /* c_iflag bits */ -#define IUCLC 0x00200 -#define IXON 0x00400 -#define IXOFF 0x01000 -#define IMAXBEL 0x04000 -#define IUTF8 0x08000 +#define IUCLC 0x0200 +#define IXON 0x0400 +#define IXOFF 0x1000 +#define IMAXBEL 0x4000 +#define IUTF8 0x8000 /* c_oflag bits */ #define OLCUC 0x00002 @@ -108,22 +107,22 @@ struct ktermios { #define HUPCL 0x00000400 #define CLOCAL 0x00000800 #define CBAUDEX 0x00001000 -#define BOTHER 0x00001000 -#define B57600 0x00001001 -#define B115200 0x00001002 -#define B230400 0x00001003 -#define B460800 0x00001004 -#define B500000 0x00001005 -#define B576000 0x00001006 -#define B921600 0x00001007 -#define B1000000 0x00001008 -#define B1152000 0x00001009 -#define B1500000 0x0000100a -#define B2000000 0x0000100b -#define B2500000 0x0000100c -#define B3000000 0x0000100d -#define B3500000 0x0000100e -#define B4000000 0x0000100f +#define BOTHER 0x00001000 +#define B57600 0x00001001 +#define B115200 0x00001002 +#define B230400 0x00001003 +#define B460800 0x00001004 +#define B500000 0x00001005 +#define B576000 0x00001006 +#define B921600 0x00001007 +#define B1000000 0x00001008 +#define B1152000 0x00001009 +#define B1500000 0x0000100a +#define B2000000 0x0000100b +#define B2500000 0x0000100c +#define B3000000 0x0000100d +#define B3500000 0x0000100e +#define B4000000 0x0000100f #define CIBAUD 0x100f0000 /* input baud rate */ /* c_lflag bits */ diff --git a/arch/powerpc/include/uapi/asm/termbits.h b/arch/powerpc/include/uapi/asm/termbits.h index 19d1350a80bb..21dc86dcb2f1 100644 --- a/arch/powerpc/include/uapi/asm/termbits.h +++ b/arch/powerpc/include/uapi/asm/termbits.h @@ -64,78 +64,72 @@ struct ktermios { #define VDISCARD 16 /* c_iflag bits */ -#define IXON 0x00200 -#define IXOFF 0x00400 -#define IUCLC 0x01000 -#define IMAXBEL 0x02000 -#define IUTF8 0x04000 +#define IXON 0x0200 +#define IXOFF 0x0400 +#define IUCLC 0x1000 +#define IMAXBEL 0x2000 +#define IUTF8 0x4000 /* c_oflag bits */ #define ONLCR 0x00002 #define OLCUC 0x00004 - - -#define NLDLY 0x000300 -#define NL0 0x000000 -#define NL1 0x000100 -#define NL2 0x000200 -#define NL3 0x000300 -#define TABDLY 0x000c00 -#define TAB0 0x000000 -#define TAB1 0x000400 -#define TAB2 0x000800 -#define TAB3 0x000c00 -#define XTABS 0x000c00 /* required by POSIX to == TAB3 */ -#define CRDLY 0x003000 -#define CR0 0x000000 -#define CR1 0x001000 -#define CR2 0x002000 -#define CR3 0x003000 -#define FFDLY 0x004000 -#define FF0 0x000000 -#define FF1 0x004000 -#define BSDLY 0x008000 -#define BS0 0x000000 -#define BS1 0x008000 -#define VTDLY 0x010000 -#define VT0 0x000000 -#define VT1 0x010000 +#define NLDLY 0x00300 +#define NL0 0x00000 +#define NL1 0x00100 +#define NL2 0x00200 +#define NL3 0x00300 +#define TABDLY 0x00c00 +#define TAB0 0x00000 +#define TAB1 0x00400 +#define TAB2 0x00800 +#define TAB3 0x00c00 +#define XTABS 0x00c00 /* required by POSIX to == TAB3 */ +#define CRDLY 0x03000 +#define CR0 0x00000 +#define CR1 0x01000 +#define CR2 0x02000 +#define CR3 0x03000 +#define FFDLY 0x04000 +#define FF0 0x00000 +#define FF1 0x04000 +#define BSDLY 0x08000 +#define BS0 0x00000 +#define BS1 0x08000 +#define VTDLY 0x10000 +#define VT0 0x00000 +#define VT1 0x10000 /* c_cflag bit meaning */ #define CBAUD 0x000000ff #define CBAUDEX 0x00000000 -#define B57600 0x00000010 -#define B115200 0x00000011 -#define B230400 0x00000012 -#define B460800 0x00000013 -#define B500000 0x00000014 -#define B576000 0x00000015 -#define B921600 0x00000016 -#define B1000000 0x00000017 -#define B1152000 0x00000018 -#define B1500000 0x00000019 -#define B2000000 0x0000001a -#define B2500000 0x0000001b -#define B3000000 0x0000001c -#define B3500000 0x0000001d -#define B4000000 0x0000001e -#define BOTHER 0x0000001f - -#define CIBAUD 0x00ff0000 - +#define BOTHER 0x0000001f +#define B57600 0x00000010 +#define B115200 0x00000011 +#define B230400 0x00000012 +#define B460800 0x00000013 +#define B500000 0x00000014 +#define B576000 0x00000015 +#define B921600 0x00000016 +#define B1000000 0x00000017 +#define B1152000 0x00000018 +#define B1500000 0x00000019 +#define B2000000 0x0000001a +#define B2500000 0x0000001b +#define B3000000 0x0000001c +#define B3500000 0x0000001d +#define B4000000 0x0000001e #define CSIZE 0x00000300 #define CS5 0x00000000 #define CS6 0x00000100 #define CS7 0x00000200 #define CS8 0x00000300 - #define CSTOPB 0x00000400 #define CREAD 0x00000800 #define PARENB 0x00001000 #define PARODD 0x00002000 #define HUPCL 0x00004000 - #define CLOCAL 0x00008000 +#define CIBAUD 0x00ff0000 /* c_lflag bits */ #define ISIG 0x00000080 diff --git a/arch/sparc/include/uapi/asm/termbits.h b/arch/sparc/include/uapi/asm/termbits.h index 854bdd153cca..cfcc4e07ce51 100644 --- a/arch/sparc/include/uapi/asm/termbits.h +++ b/arch/sparc/include/uapi/asm/termbits.h @@ -7,9 +7,9 @@ #include #if defined(__sparc__) && defined(__arch64__) -typedef unsigned int tcflag_t; +typedef unsigned int tcflag_t; #else -typedef unsigned long tcflag_t; +typedef unsigned long tcflag_t; #endif #define NCC 8 @@ -60,21 +60,19 @@ struct ktermios { }; /* c_cc characters */ -#define VINTR 0 -#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VEOL 5 -#define VEOL2 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 - - +#define VINTR 0 +#define VQUIT 1 +#define VERASE 2 +#define VKILL 3 +#define VEOF 4 +#define VEOL 5 +#define VEOL2 6 +#define VSWTC 7 +#define VSTART 8 +#define VSTOP 9 #define VSUSP 10 -#define VDSUSP 11 /* SunOS POSIX nicety I do believe... */ +#define VDSUSP 11 /* SunOS POSIX nicety I do believe... */ #define VREPRINT 12 #define VDISCARD 13 #define VWERASE 14 @@ -89,83 +87,83 @@ struct ktermios { #endif /* c_iflag bits */ -#define IUCLC 0x00000200 -#define IXON 0x00000400 -#define IXOFF 0x00001000 -#define IMAXBEL 0x00002000 -#define IUTF8 0x00004000 +#define IUCLC 0x0200 +#define IXON 0x0400 +#define IXOFF 0x1000 +#define IMAXBEL 0x2000 +#define IUTF8 0x4000 /* c_oflag bits */ -#define OLCUC 0x00000002 -#define ONLCR 0x00000004 -#define NLDLY 0x00000100 -#define NL0 0x00000000 -#define NL1 0x00000100 -#define CRDLY 0x00000600 -#define CR0 0x00000000 -#define CR1 0x00000200 -#define CR2 0x00000400 -#define CR3 0x00000600 -#define TABDLY 0x00001800 -#define TAB0 0x00000000 -#define TAB1 0x00000800 -#define TAB2 0x00001000 -#define TAB3 0x00001800 -#define XTABS 0x00001800 -#define BSDLY 0x00002000 -#define BS0 0x00000000 -#define BS1 0x00002000 -#define VTDLY 0x00004000 -#define VT0 0x00000000 -#define VT1 0x00004000 -#define FFDLY 0x00008000 -#define FF0 0x00000000 -#define FF1 0x00008000 -#define PAGEOUT 0x00010000 /* SUNOS specific */ -#define WRAP 0x00020000 /* SUNOS specific */ +#define OLCUC 0x00002 +#define ONLCR 0x00004 +#define NLDLY 0x00100 +#define NL0 0x00000 +#define NL1 0x00100 +#define CRDLY 0x00600 +#define CR0 0x00000 +#define CR1 0x00200 +#define CR2 0x00400 +#define CR3 0x00600 +#define TABDLY 0x01800 +#define TAB0 0x00000 +#define TAB1 0x00800 +#define TAB2 0x01000 +#define TAB3 0x01800 +#define XTABS 0x01800 +#define BSDLY 0x02000 +#define BS0 0x00000 +#define BS1 0x02000 +#define VTDLY 0x04000 +#define VT0 0x00000 +#define VT1 0x04000 +#define FFDLY 0x08000 +#define FF0 0x00000 +#define FF1 0x08000 +#define PAGEOUT 0x10000 /* SUNOS specific */ +#define WRAP 0x20000 /* SUNOS specific */ /* c_cflag bit meaning */ -#define CBAUD 0x0000100f -#define CSIZE 0x00000030 -#define CS5 0x00000000 -#define CS6 0x00000010 -#define CS7 0x00000020 -#define CS8 0x00000030 -#define CSTOPB 0x00000040 -#define CREAD 0x00000080 -#define PARENB 0x00000100 -#define PARODD 0x00000200 -#define HUPCL 0x00000400 -#define CLOCAL 0x00000800 -#define CBAUDEX 0x00001000 +#define CBAUD 0x0000100f +#define CSIZE 0x00000030 +#define CS5 0x00000000 +#define CS6 0x00000010 +#define CS7 0x00000020 +#define CS8 0x00000030 +#define CSTOPB 0x00000040 +#define CREAD 0x00000080 +#define PARENB 0x00000100 +#define PARODD 0x00000200 +#define HUPCL 0x00000400 +#define CLOCAL 0x00000800 +#define CBAUDEX 0x00001000 /* We'll never see these speeds with the Zilogs, but for completeness... */ -#define BOTHER 0x00001000 -#define B57600 0x00001001 -#define B115200 0x00001002 -#define B230400 0x00001003 -#define B460800 0x00001004 +#define BOTHER 0x00001000 +#define B57600 0x00001001 +#define B115200 0x00001002 +#define B230400 0x00001003 +#define B460800 0x00001004 /* This is what we can do with the Zilogs. */ -#define B76800 0x00001005 +#define B76800 0x00001005 /* This is what we can do with the SAB82532. */ -#define B153600 0x00001006 -#define B307200 0x00001007 -#define B614400 0x00001008 -#define B921600 0x00001009 +#define B153600 0x00001006 +#define B307200 0x00001007 +#define B614400 0x00001008 +#define B921600 0x00001009 /* And these are the rest... */ -#define B500000 0x0000100a -#define B576000 0x0000100b -#define B1000000 0x0000100c -#define B1152000 0x0000100d -#define B1500000 0x0000100e -#define B2000000 0x0000100f +#define B500000 0x0000100a +#define B576000 0x0000100b +#define B1000000 0x0000100c +#define B1152000 0x0000100d +#define B1500000 0x0000100e +#define B2000000 0x0000100f /* These have totally bogus values and nobody uses them so far. Later on we'd have to use say 0x10000x and adjust CBAUD constant and drivers accordingly. -#define B2500000 0x00001010 -#define B3000000 0x00001011 -#define B3500000 0x00001012 -#define B4000000 0x00001013 */ -#define CIBAUD 0x100f0000 /* input baud rate (not used) */ +#define B2500000 0x00001010 +#define B3000000 0x00001011 +#define B3500000 0x00001012 +#define B4000000 0x00001013 */ +#define CIBAUD 0x100f0000 /* input baud rate (not used) */ /* c_lflag bits */ #define ISIG 0x00000001 @@ -180,7 +178,7 @@ struct ktermios { #define ECHOCTL 0x00000200 #define ECHOPRT 0x00000400 #define ECHOKE 0x00000800 -#define DEFECHO 0x00001000 /* SUNOS thing, what is it? */ +#define DEFECHO 0x00001000 /* SUNOS thing, what is it? */ #define FLUSHO 0x00002000 #define PENDIN 0x00004000 #define IEXTEN 0x00008000 @@ -206,8 +204,8 @@ struct ktermios { #define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ /* tcsetattr uses these */ -#define TCSANOW 0 -#define TCSADRAIN 1 -#define TCSAFLUSH 2 +#define TCSANOW 0 +#define TCSADRAIN 1 +#define TCSAFLUSH 2 #endif /* _UAPI_SPARC_TERMBITS_H */ diff --git a/include/uapi/asm-generic/termbits.h b/include/uapi/asm-generic/termbits.h index 78a85c326eef..c92179563289 100644 --- a/include/uapi/asm-generic/termbits.h +++ b/include/uapi/asm-generic/termbits.h @@ -41,30 +41,30 @@ struct ktermios { }; /* c_cc characters */ -#define VINTR 0 -#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VTIME 5 -#define VMIN 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 -#define VSUSP 10 -#define VEOL 11 -#define VREPRINT 12 -#define VDISCARD 13 -#define VWERASE 14 -#define VLNEXT 15 -#define VEOL2 16 +#define VINTR 0 +#define VQUIT 1 +#define VERASE 2 +#define VKILL 3 +#define VEOF 4 +#define VTIME 5 +#define VMIN 6 +#define VSWTC 7 +#define VSTART 8 +#define VSTOP 9 +#define VSUSP 10 +#define VEOL 11 +#define VREPRINT 12 +#define VDISCARD 13 +#define VWERASE 14 +#define VLNEXT 15 +#define VEOL2 16 /* c_iflag bits */ -#define IUCLC 0x00200 -#define IXON 0x00400 -#define IXOFF 0x01000 -#define IMAXBEL 0x02000 -#define IUTF8 0x04000 +#define IUCLC 0x0200 +#define IXON 0x0400 +#define IXOFF 0x1000 +#define IMAXBEL 0x2000 +#define IUTF8 0x4000 /* c_oflag bits */ #define OLCUC 0x00002 @@ -107,22 +107,22 @@ struct ktermios { #define HUPCL 0x00000400 #define CLOCAL 0x00000800 #define CBAUDEX 0x00001000 -#define BOTHER 0x00001000 -#define B57600 0x00001001 -#define B115200 0x00001002 -#define B230400 0x00001003 -#define B460800 0x00001004 -#define B500000 0x00001005 -#define B576000 0x00001006 -#define B921600 0x00001007 -#define B1000000 0x00001008 -#define B1152000 0x00001009 -#define B1500000 0x0000100a -#define B2000000 0x0000100b -#define B2500000 0x0000100c -#define B3000000 0x0000100d -#define B3500000 0x0000100e -#define B4000000 0x0000100f +#define BOTHER 0x00001000 +#define B57600 0x00001001 +#define B115200 0x00001002 +#define B230400 0x00001003 +#define B460800 0x00001004 +#define B500000 0x00001005 +#define B576000 0x00001006 +#define B921600 0x00001007 +#define B1000000 0x00001008 +#define B1152000 0x00001009 +#define B1500000 0x0000100a +#define B2000000 0x0000100b +#define B2500000 0x0000100c +#define B3000000 0x0000100d +#define B3500000 0x0000100e +#define B4000000 0x0000100f #define CIBAUD 0x100f0000 /* input baud rate */ /* c_lflag bits */ -- cgit v1.2.3 From 69505e3d9a39a988aaed9b58aa6b3482238f6516 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 12 May 2022 06:56:23 -0700 Subject: bug: Use normal relative pointers in 'struct bug_entry' With CONFIG_GENERIC_BUG_RELATIVE_POINTERS, the addr/file relative pointers are calculated weirdly: based on the beginning of the bug_entry struct address, rather than their respective pointer addresses. Make the relative pointers less surprising to both humans and tools by calculating them the normal way. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Acked-by: Peter Zijlstra (Intel) Acked-by: Sven Schnelle # s390 Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Tested-by: Mark Rutland [arm64] Link: https://lkml.kernel.org/r/f0e05be797a16f4fc2401eeb88c8450dcbe61df6.1652362951.git.jpoimboe@kernel.org --- arch/arm64/include/asm/asm-bug.h | 4 ++-- arch/powerpc/include/asm/bug.h | 14 ++++++++------ arch/riscv/include/asm/bug.h | 4 ++-- arch/s390/include/asm/bug.h | 5 +++-- arch/x86/include/asm/bug.h | 2 +- lib/bug.c | 15 +++++++-------- 6 files changed, 23 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/asm-bug.h b/arch/arm64/include/asm/asm-bug.h index 03f52f84a4f3..c762038ba400 100644 --- a/arch/arm64/include/asm/asm-bug.h +++ b/arch/arm64/include/asm/asm-bug.h @@ -14,7 +14,7 @@ 14472: .string file; \ .popsection; \ \ - .long 14472b - 14470b; \ + .long 14472b - .; \ .short line; #else #define _BUGVERBOSE_LOCATION(file, line) @@ -25,7 +25,7 @@ #define __BUG_ENTRY(flags) \ .pushsection __bug_table,"aw"; \ .align 2; \ - 14470: .long 14471f - 14470b; \ + 14470: .long 14471f - .; \ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ .short flags; \ .popsection; \ diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index ecbae1832de3..61a4736355c2 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -13,7 +13,8 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE .macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" -5001: .4byte \addr - 5001b, 5002f - 5001b +5001: .4byte \addr - . + .4byte 5002f - . .short \line, \flags .org 5001b+BUG_ENTRY_SIZE .previous @@ -24,7 +25,7 @@ #else .macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" -5001: .4byte \addr - 5001b +5001: .4byte \addr - . .short \flags .org 5001b+BUG_ENTRY_SIZE .previous @@ -49,15 +50,16 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE #define _EMIT_BUG_ENTRY \ ".section __bug_table,\"aw\"\n" \ - "2:\t.4byte 1b - 2b, %0 - 2b\n" \ - "\t.short %1, %2\n" \ + "2: .4byte 1b - .\n" \ + " .4byte %0 - .\n" \ + " .short %1, %2\n" \ ".org 2b+%3\n" \ ".previous\n" #else #define _EMIT_BUG_ENTRY \ ".section __bug_table,\"aw\"\n" \ - "2:\t.4byte 1b - 2b\n" \ - "\t.short %2\n" \ + "2: .4byte 1b - .\n" \ + " .short %2\n" \ ".org 2b+%3\n" \ ".previous\n" #endif diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index d3804a2f9aad..1aaea81fb141 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -30,8 +30,8 @@ typedef u32 bug_insn_t; #ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS -#define __BUG_ENTRY_ADDR RISCV_INT " 1b - 2b" -#define __BUG_ENTRY_FILE RISCV_INT " %0 - 2b" +#define __BUG_ENTRY_ADDR RISCV_INT " 1b - ." +#define __BUG_ENTRY_FILE RISCV_INT " %0 - ." #else #define __BUG_ENTRY_ADDR RISCV_PTR " 1b" #define __BUG_ENTRY_FILE RISCV_PTR " %0" diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 0b25f28351ed..aebe1e22c7be 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -15,7 +15,8 @@ "1: .asciz \""__FILE__"\"\n" \ ".previous\n" \ ".section __bug_table,\"awM\",@progbits,%2\n" \ - "2: .long 0b-2b,1b-2b\n" \ + "2: .long 0b-.\n" \ + " .long 1b-.\n" \ " .short %0,%1\n" \ " .org 2b+%2\n" \ ".previous\n" \ @@ -30,7 +31,7 @@ asm_inline volatile( \ "0: mc 0,0\n" \ ".section __bug_table,\"awM\",@progbits,%1\n" \ - "1: .long 0b-1b\n" \ + "1: .long 0b-.\n" \ " .short %0\n" \ " .org 1b+%1\n" \ ".previous\n" \ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 4d20a293c6fd..76fbe24cccf9 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -18,7 +18,7 @@ #ifdef CONFIG_X86_32 # define __BUG_REL(val) ".long " __stringify(val) #else -# define __BUG_REL(val) ".long " __stringify(val) " - 2b" +# define __BUG_REL(val) ".long " __stringify(val) " - ." #endif #ifdef CONFIG_DEBUG_BUGVERBOSE diff --git a/lib/bug.c b/lib/bug.c index 45a0584f6541..c223a2575b72 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -6,8 +6,7 @@ CONFIG_BUG - emit BUG traps. Nothing happens without this. CONFIG_GENERIC_BUG - enable this code. - CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit pointers relative to - the containing struct bug_entry for bug_addr and file. + CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit relative pointers for bug_addr and file CONFIG_DEBUG_BUGVERBOSE - emit full file+line information for each BUG CONFIG_BUG and CONFIG_DEBUG_BUGVERBOSE are potentially user-settable @@ -53,10 +52,10 @@ extern struct bug_entry __start___bug_table[], __stop___bug_table[]; static inline unsigned long bug_addr(const struct bug_entry *bug) { -#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - return bug->bug_addr; +#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + return (unsigned long)&bug->bug_addr_disp + bug->bug_addr_disp; #else - return (unsigned long)bug + bug->bug_addr_disp; + return bug->bug_addr; #endif } @@ -131,10 +130,10 @@ void bug_get_file_line(struct bug_entry *bug, const char **file, unsigned int *line) { #ifdef CONFIG_DEBUG_BUGVERBOSE -#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - *file = bug->file; +#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + *file = (const char *)&bug->file_disp + bug->file_disp; #else - *file = (const char *)bug + bug->file_disp; + *file = bug->file; #endif *line = bug->line; #else -- cgit v1.2.3 From af8b9f352ffd435734ab8f94f99ccb922da916b4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:17 +0200 Subject: powerpc/ftrace: Minimise number of #ifdefs A lot of #ifdefs can be replaced by IS_ENABLED() Do so. Signed-off-by: Christophe Leroy [mpe: Fold in changes suggested by Naveen and Christophe on list] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/18ce6708d6f8c71d87436f9c6019f04df4125128.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 2 - arch/powerpc/include/asm/module.h | 2 - arch/powerpc/kernel/trace/ftrace.c | 175 +++++++++++++++---------------- 3 files changed, 85 insertions(+), 94 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 2f73c0900040..1c6316ec4b74 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -219,7 +219,6 @@ static inline unsigned long ppc_kallsyms_lookup_name(const char *name) return addr; } -#ifdef CONFIG_PPC64 /* * Some instruction encodings commonly used in dynamic ftracing * and function live patching. @@ -236,6 +235,5 @@ static inline unsigned long ppc_kallsyms_lookup_name(const char *name) /* usually preceded by a mflr r0 */ #define PPC_INST_STD_LR PPC_RAW_STD(_R0, _R1, PPC_LR_STKOFF) -#endif /* CONFIG_PPC64 */ #endif /* _ASM_POWERPC_CODE_PATCHING_H */ diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index 857d9ff24295..09e2ffd360bb 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -41,9 +41,7 @@ struct mod_arch_specific { #ifdef CONFIG_DYNAMIC_FTRACE unsigned long tramp; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS unsigned long tramp_regs; -#endif #endif /* List of BUG addresses, source line numbers and filenames */ diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 964453beb2c1..2807136a1c19 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -150,26 +150,39 @@ __ftrace_make_nop(struct module *mod, return -EINVAL; } - /* When using -mprofile-kernel or PPC32 there is no load to jump over */ - pop = ppc_inst(PPC_RAW_NOP()); + if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) { + if (copy_inst_from_kernel_nofault(&op, (void *)(ip - 4))) { + pr_err("Fetching instruction at %lx failed.\n", ip - 4); + return -EFAULT; + } -#ifdef CONFIG_PPC64 -#ifdef CONFIG_MPROFILE_KERNEL - if (copy_inst_from_kernel_nofault(&op, (void *)(ip - 4))) { - pr_err("Fetching instruction at %lx failed.\n", ip - 4); - return -EFAULT; - } + /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ + if (!ppc_inst_equal(op, ppc_inst(PPC_RAW_MFLR(_R0))) && + !ppc_inst_equal(op, ppc_inst(PPC_INST_STD_LR))) { + pr_err("Unexpected instruction %s around bl _mcount\n", + ppc_inst_as_str(op)); + return -EINVAL; + } + } else if (IS_ENABLED(CONFIG_PPC64)) { + /* + * Check what is in the next instruction. We can see ld r2,40(r1), but + * on first pass after boot we will see mflr r0. + */ + if (copy_inst_from_kernel_nofault(&op, (void *)(ip + 4))) { + pr_err("Fetching op failed.\n"); + return -EFAULT; + } - /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ - if (!ppc_inst_equal(op, ppc_inst(PPC_RAW_MFLR(_R0))) && - !ppc_inst_equal(op, ppc_inst(PPC_INST_STD_LR))) { - pr_err("Unexpected instruction %s around bl _mcount\n", - ppc_inst_as_str(op)); - return -EINVAL; + if (!ppc_inst_equal(op, ppc_inst(PPC_INST_LD_TOC))) { + pr_err("Expected %08lx found %s\n", PPC_INST_LD_TOC, ppc_inst_as_str(op)); + return -EINVAL; + } } -#else + /* - * Our original call site looks like: + * When using -mprofile-kernel or PPC32 there is no load to jump over. + * + * Otherwise our original call site looks like: * * bl * ld r2,XX(r1) @@ -181,24 +194,10 @@ __ftrace_make_nop(struct module *mod, * * Use a b +8 to jump over the load. */ - - pop = ppc_inst(PPC_RAW_BRANCH(8)); /* b +8 */ - - /* - * Check what is in the next instruction. We can see ld r2,40(r1), but - * on first pass after boot we will see mflr r0. - */ - if (copy_inst_from_kernel_nofault(&op, (void *)(ip + 4))) { - pr_err("Fetching op failed.\n"); - return -EFAULT; - } - - if (!ppc_inst_equal(op, ppc_inst(PPC_INST_LD_TOC))) { - pr_err("Expected %08lx found %s\n", PPC_INST_LD_TOC, ppc_inst_as_str(op)); - return -EINVAL; - } -#endif /* CONFIG_MPROFILE_KERNEL */ -#endif /* PPC64 */ + if (IS_ENABLED(CONFIG_MPROFILE_KERNEL) || IS_ENABLED(CONFIG_PPC32)) + pop = ppc_inst(PPC_RAW_NOP()); + else + pop = ppc_inst(PPC_RAW_BRANCH(8)); /* b +8 */ if (patch_instruction((u32 *)ip, pop)) { pr_err("Patching NOP failed.\n"); @@ -207,6 +206,11 @@ __ftrace_make_nop(struct module *mod, return 0; } +#else +static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) +{ + return 0; +} #endif /* CONFIG_MODULES */ static unsigned long find_ftrace_tramp(unsigned long ip) @@ -279,11 +283,11 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) } /* Let's re-write the tramp to go to ftrace_[regs_]caller */ -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - ptr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - ptr = ppc_global_function_entry((void *)ftrace_caller); -#endif + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + ptr = ppc_global_function_entry((void *)ftrace_regs_caller); + else + ptr = ppc_global_function_entry((void *)ftrace_caller); + if (patch_branch((u32 *)tramp, ptr, 0)) { pr_debug("REL24 out of range!\n"); return -1; @@ -352,10 +356,12 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr, 1); new = ppc_inst(PPC_RAW_NOP()); return ftrace_modify_code(ip, old, new); - } else if (core_kernel_text(ip)) + } else if (core_kernel_text(ip)) { return __ftrace_make_nop_kernel(rec, addr); + } else if (!IS_ENABLED(CONFIG_MODULES)) { + return -EINVAL; + } -#ifdef CONFIG_MODULES /* * Out of range jumps are called from modules. * We should either already have a pointer to the module @@ -378,10 +384,6 @@ int ftrace_make_nop(struct module *mod, mod = rec->arch.mod; return __ftrace_make_nop(mod, rec, addr); -#else - /* We should not get here without modules */ - return -EINVAL; -#endif /* CONFIG_MODULES */ } #ifdef CONFIG_MODULES @@ -411,10 +413,9 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) if (copy_inst_from_kernel_nofault(op, ip)) return -EFAULT; -#ifdef CONFIG_PPC64_ELF_ABI_V1 - if (copy_inst_from_kernel_nofault(op + 1, ip + 4)) + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1) && + copy_inst_from_kernel_nofault(op + 1, ip + 4)) return -EFAULT; -#endif if (!expected_nop_sequence(ip, op[0], op[1])) { pr_err("Unexpected call sequence at %p: %s %s\n", @@ -423,20 +424,15 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } /* If we never set up ftrace trampoline(s), then bail */ -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - if (!mod->arch.tramp || !mod->arch.tramp_regs) { -#else - if (!mod->arch.tramp) { -#endif + if (!mod->arch.tramp || + (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && !mod->arch.tramp_regs)) { pr_err("No ftrace trampoline\n"); return -EINVAL; } -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - if (rec->flags & FTRACE_FL_REGS) + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && rec->flags & FTRACE_FL_REGS) tramp = mod->arch.tramp_regs; else -#endif tramp = mod->arch.tramp; if (module_trampoline_target(mod, tramp, &ptr)) { @@ -460,6 +456,11 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return 0; } +#else +static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + return 0; +} #endif /* CONFIG_MODULES */ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) @@ -472,16 +473,12 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) entry = ppc_global_function_entry((void *)ftrace_caller); ptr = ppc_global_function_entry((void *)addr); - if (ptr != entry) { -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + if (ptr != entry && IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) entry = ppc_global_function_entry((void *)ftrace_regs_caller); - if (ptr != entry) { -#endif - pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); - return -EINVAL; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - } -#endif + + if (ptr != entry) { + pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); + return -EINVAL; } /* Make sure we have a nop */ @@ -524,10 +521,13 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = ppc_inst(PPC_RAW_NOP()); new = ftrace_call_replace(ip, addr, 1); return ftrace_modify_code(ip, old, new); - } else if (core_kernel_text(ip)) + } else if (core_kernel_text(ip)) { return __ftrace_make_call_kernel(rec, addr); + } else if (!IS_ENABLED(CONFIG_MODULES)) { + /* We should not get here without modules */ + return -EINVAL; + } -#ifdef CONFIG_MODULES /* * Out of range jumps are called from modules. * Being that we are converting from nop, it had better @@ -539,10 +539,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } return __ftrace_make_call(rec, addr); -#else - /* We should not get here without modules */ - return -EINVAL; -#endif /* CONFIG_MODULES */ } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS @@ -633,6 +629,11 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return 0; } +#else +static int __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) +{ + return 0; +} #endif int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, @@ -657,9 +658,11 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, * variant, so there is nothing to do here */ return 0; + } else if (!IS_ENABLED(CONFIG_MODULES)) { + /* We should not get here without modules */ + return -EINVAL; } -#ifdef CONFIG_MODULES /* * Out of range jumps are called from modules. */ @@ -669,10 +672,6 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, } return __ftrace_modify_call(rec, old_addr, addr); -#else - /* We should not get here without modules */ - return -EINVAL; -#endif /* CONFIG_MODULES */ } #endif @@ -686,15 +685,13 @@ int ftrace_update_ftrace_func(ftrace_func_t func) new = ftrace_call_replace(ip, (unsigned long)func, 1); ret = ftrace_modify_code(ip, old, new); -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* Also update the regs callback function */ - if (!ret) { + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && !ret) { ip = (unsigned long)(&ftrace_regs_call); old = ppc_inst_read((u32 *)&ftrace_regs_call); new = ftrace_call_replace(ip, (unsigned long)func, 1); ret = ftrace_modify_code(ip, old, new); } -#endif return ret; } @@ -724,12 +721,15 @@ int __init ftrace_dyn_arch_init(void) PPC_RAW_MTCTR(_R12), PPC_RAW_BCTR() }; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - unsigned long addr = ppc_global_function_entry((void *)ftrace_caller); -#endif - long reladdr = addr - kernel_toc_addr(); + unsigned long addr; + long reladdr; + + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + addr = ppc_global_function_entry((void *)ftrace_regs_caller); + else + addr = ppc_global_function_entry((void *)ftrace_caller); + + reladdr = addr - kernel_toc_addr(); if (reladdr >= SZ_2G || reladdr < -(long)SZ_2G) { pr_err("Address of %ps out of range of kernel_toc.\n", @@ -746,11 +746,6 @@ int __init ftrace_dyn_arch_init(void) return 0; } -#else -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.2.3 From 8dfdbe4368c09d9eeae2df8968ee6c345ec8c1b5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:18 +0200 Subject: powerpc/inst: Add __copy_inst_from_kernel_nofault() On the same model as get_user() versus __get_user(), introduce __copy_inst_from_kernel_nofault() which doesn't check address. To be used by callers that have already checked that the adress is a kernel address. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1f3702890d6dbd64702b61834753bcc96851c18c.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/inst.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index 80b6d74146c6..b49aae9f6f27 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -158,13 +158,10 @@ static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], ppc_inst_t x) __str; \ }) -static inline int copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src) +static inline int __copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src) { unsigned int val, suffix; - if (unlikely(!is_kernel_addr((unsigned long)src))) - return -ERANGE; - /* See https://github.com/ClangBuiltLinux/linux/issues/1521 */ #if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 140000 val = suffix = 0; @@ -181,4 +178,12 @@ Efault: return -EFAULT; } +static inline int copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src) +{ + if (unlikely(!is_kernel_addr((unsigned long)src))) + return -ERANGE; + + return __copy_inst_from_kernel_nofault(inst, src); +} + #endif /* _ASM_POWERPC_INST_H */ -- cgit v1.2.3 From 4390a58ee1c37dc915dcf44fabe925b160f5bcf0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:20 +0200 Subject: powerpc/inst: Remove PPC_INST_BRANCH Convert last users of PPC_INST_BRANCH to PPC_RAW_BRANCH() And remove PPC_INST_BRANCH. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/fa8807108a2ef2287a2c9651d6e1ff7c051923d9.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 3 +-- arch/powerpc/lib/feature-fixups.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index de6534d561f7..87e42289573a 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -301,7 +301,6 @@ #define PPC_INST_ADDIS 0x3c000000 #define PPC_INST_ADD 0x7c000214 #define PPC_INST_DIVD 0x7c0003d2 -#define PPC_INST_BRANCH 0x48000000 #define PPC_INST_BL 0x48000001 #define PPC_INST_BRANCH_COND 0x40800000 @@ -586,7 +585,7 @@ #define PPC_RAW_MTSPR(spr, d) (0x7c0003a6 | ___PPC_RS(d) | __PPC_SPR(spr)) #define PPC_RAW_EIEIO() (0x7c0006ac) -#define PPC_RAW_BRANCH(addr) (PPC_INST_BRANCH | ((addr) & 0x03fffffc)) +#define PPC_RAW_BRANCH(offset) (0x48000000 | PPC_LI(offset)) #define PPC_RAW_BL(offset) (0x48000001 | PPC_LI(offset)) /* Deal with instructions that older assemblers aren't aware of */ diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 343a78826035..993d3f31832a 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -451,7 +451,7 @@ static int __do_rfi_flush_fixups(void *data) if (types & L1D_FLUSH_FALLBACK) /* b .+16 to fallback flush */ - instrs[0] = PPC_INST_BRANCH | 16; + instrs[0] = PPC_RAW_BRANCH(16); i = 0; if (types & L1D_FLUSH_ORI) { -- cgit v1.2.3 From ae2c760fa10ba2475aa46fffa6be42050586c604 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:22 +0200 Subject: powerpc/inst: Remove PPC_INST_BL Convert last users of PPC_INST_BL to PPC_RAW_BL() And remove PPC_INST_BL. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d9eacb758e7ae7cf224211ebe3f6f7d409a333be.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 1 - arch/powerpc/net/bpf_jit.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 87e42289573a..6cc12c6aa352 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -301,7 +301,6 @@ #define PPC_INST_ADDIS 0x3c000000 #define PPC_INST_ADD 0x7c000214 #define PPC_INST_DIVD 0x7c0003d2 -#define PPC_INST_BL 0x48000001 #define PPC_INST_BRANCH_COND 0x40800000 /* Prefixes */ diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 80d973da9093..a4f7880f959d 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -35,7 +35,7 @@ } while (0) /* bl (unconditional 'branch' with link) */ -#define PPC_BL(dest) EMIT(PPC_INST_BL | (((dest) - (unsigned long)(image + ctx->idx)) & 0x03fffffc)) +#define PPC_BL(dest) EMIT(PPC_RAW_BL((dest) - (unsigned long)(image + ctx->idx))) /* "cond" here covers BO:BI fields. */ #define PPC_BCC_SHORT(cond, dest) \ -- cgit v1.2.3 From 6bdc81eca9519a85d36b3915136640ef9cba1a23 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 9 May 2022 07:36:23 +0200 Subject: powerpc/opcodes: Remove unused PPC_INST_XXX macros The following PPC_INST_XXX macros are not used anymore outside ppc-opcode.h: - PPC_INST_LD - PPC_INST_STD - PPC_INST_ADDIS - PPC_INST_ADD - PPC_INST_DIVD Remove them. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/8c28636126f69141419953b5638b4a908c184dc1.1652074503.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 6cc12c6aa352..89beabf5325c 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -296,11 +296,6 @@ #define PPC_INST_TRECHKPT 0x7c0007dd #define PPC_INST_TRECLAIM 0x7c00075d #define PPC_INST_TSR 0x7c0005dd -#define PPC_INST_LD 0xe8000000 -#define PPC_INST_STD 0xf8000000 -#define PPC_INST_ADDIS 0x3c000000 -#define PPC_INST_ADD 0x7c000214 -#define PPC_INST_DIVD 0x7c0003d2 #define PPC_INST_BRANCH_COND 0x40800000 /* Prefixes */ @@ -473,10 +468,10 @@ (0x100000c7 | ___PPC_RT(vrt) | ___PPC_RA(vra) | ___PPC_RB(vrb) | __PPC_RC21) #define PPC_RAW_VCMPEQUB_RC(vrt, vra, vrb) \ (0x10000006 | ___PPC_RT(vrt) | ___PPC_RA(vra) | ___PPC_RB(vrb) | __PPC_RC21) -#define PPC_RAW_LD(r, base, i) (PPC_INST_LD | ___PPC_RT(r) | ___PPC_RA(base) | IMM_DS(i)) +#define PPC_RAW_LD(r, base, i) (0xe8000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_DS(i)) #define PPC_RAW_LWZ(r, base, i) (0x80000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) #define PPC_RAW_LWZX(t, a, b) (0x7c00002e | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) -#define PPC_RAW_STD(r, base, i) (PPC_INST_STD | ___PPC_RS(r) | ___PPC_RA(base) | IMM_DS(i)) +#define PPC_RAW_STD(r, base, i) (0xf8000000 | ___PPC_RS(r) | ___PPC_RA(base) | IMM_DS(i)) #define PPC_RAW_STDCX(s, a, b) (0x7c0001ad | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_LFSX(t, a, b) (0x7c00042e | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_STFSX(s, a, b) (0x7c00052e | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b)) @@ -487,8 +482,8 @@ #define PPC_RAW_ADDE(t, a, b) (0x7c000114 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_ADDZE(t, a) (0x7c000194 | ___PPC_RT(t) | ___PPC_RA(a)) #define PPC_RAW_ADDME(t, a) (0x7c0001d4 | ___PPC_RT(t) | ___PPC_RA(a)) -#define PPC_RAW_ADD(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) -#define PPC_RAW_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | 0x1) +#define PPC_RAW_ADD(t, a, b) (0x7c000214 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_ADD_DOT(t, a, b) (0x7c000214 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | 0x1) #define PPC_RAW_ADDC(t, a, b) (0x7c000014 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_ADDC_DOT(t, a, b) (0x7c000014 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | 0x1) #define PPC_RAW_NOP() PPC_RAW_ORI(0, 0, 0) -- cgit v1.2.3 From c127d130f6d59fa81701f6b04023cf7cd1972fb3 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Sat, 9 Apr 2022 01:44:16 -0700 Subject: powerpc/powernv/vas: Assign real address to rx_fifo in vas_rx_win_attr In init_winctx_regs(), __pa() is called on winctx->rx_fifo and this function is called to initialize registers for receive and fault windows. But the real address is passed in winctx->rx_fifo for receive windows and the virtual address for fault windows which causes errors with DEBUG_VIRTUAL enabled. Fixes this issue by assigning only real address to rx_fifo in vas_rx_win_attr struct for both receive and fault windows. Reported-by: Michael Ellerman Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/338e958c7ab8f3b266fa794a1f80f99b9671829e.camel@linux.ibm.com --- arch/powerpc/include/asm/vas.h | 2 +- arch/powerpc/platforms/powernv/vas-fault.c | 2 +- arch/powerpc/platforms/powernv/vas-window.c | 4 ++-- arch/powerpc/platforms/powernv/vas.h | 2 +- drivers/crypto/nx/nx-common-powernv.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index 83afcb6c194b..c36f71e01c0f 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -126,7 +126,7 @@ static inline void vas_user_win_add_mm_context(struct vas_user_win_ref *ref) * Receive window attributes specified by the (in-kernel) owner of window. */ struct vas_rx_win_attr { - void *rx_fifo; + u64 rx_fifo; int rx_fifo_size; int wcreds_max; diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c index a7aabc18039e..c1bfad56447d 100644 --- a/arch/powerpc/platforms/powernv/vas-fault.c +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -216,7 +216,7 @@ int vas_setup_fault_window(struct vas_instance *vinst) vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT); attr.rx_fifo_size = vinst->fault_fifo_size; - attr.rx_fifo = vinst->fault_fifo; + attr.rx_fifo = __pa(vinst->fault_fifo); /* * Max creds is based on number of CRBs can fit in the FIFO. diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 0f8d39fbf2b2..0072682531d8 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -404,7 +404,7 @@ static void init_winctx_regs(struct pnv_vas_window *window, * * See also: Design note in function header. */ - val = __pa(winctx->rx_fifo); + val = winctx->rx_fifo; val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0); write_hvwc_reg(window, VREG(LFIFO_BAR), val); @@ -739,7 +739,7 @@ static void init_winctx_for_rxwin(struct pnv_vas_window *rxwin, */ winctx->fifo_disable = true; winctx->intr_disable = true; - winctx->rx_fifo = NULL; + winctx->rx_fifo = 0; } winctx->lnotify_lpid = rxattr->lnotify_lpid; diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 8bb08e395de0..08d9d3d5a22b 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -376,7 +376,7 @@ struct pnv_vas_window { * is a container for the register fields in the window context. */ struct vas_winctx { - void *rx_fifo; + u64 rx_fifo; int rx_fifo_size; int wcreds_max; int rsvd_txbuf_count; diff --git a/drivers/crypto/nx/nx-common-powernv.c b/drivers/crypto/nx/nx-common-powernv.c index 32a036ada5d0..f418817c0f43 100644 --- a/drivers/crypto/nx/nx-common-powernv.c +++ b/drivers/crypto/nx/nx-common-powernv.c @@ -827,7 +827,7 @@ static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id, goto err_out; vas_init_rx_win_attr(&rxattr, coproc->ct); - rxattr.rx_fifo = (void *)rx_fifo; + rxattr.rx_fifo = rx_fifo; rxattr.rx_fifo_size = fifo_size; rxattr.lnotify_lpid = lpid; rxattr.lnotify_pid = pid; -- cgit v1.2.3 From c4bce84d0bd3f396f702d69be2e92bbd8af97583 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 7 Apr 2022 00:58:01 +1000 Subject: powerpc/64: Only WARN if __pa()/__va() called with bad addresses We added checks to __pa() / __va() to ensure they're only called with appropriate addresses. But using BUG_ON() is too strong, it means virt_addr_valid() will BUG when DEBUG_VIRTUAL is enabled. Instead switch them to warnings, arm64 does the same. Fixes: 4dd7554a6456 ("powerpc/64: Add VIRTUAL_BUG_ON checks for __va and __pa addresses") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220406145802.538416-5-mpe@ellerman.id.au --- arch/powerpc/include/asm/page.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index b3101ffe0151..e5f75c70eda8 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -216,6 +216,9 @@ static inline bool pfn_valid(unsigned long pfn) #define __pa(x) ((phys_addr_t)(unsigned long)(x) - VIRT_PHYS_OFFSET) #else #ifdef CONFIG_PPC64 + +#define VIRTUAL_WARN_ON(x) WARN_ON(IS_ENABLED(CONFIG_DEBUG_VIRTUAL) && (x)) + /* * gcc miscompiles (unsigned long)(&static_var) - PAGE_OFFSET * with -mcmodel=medium, so we use & and | instead of - and + on 64-bit. @@ -223,13 +226,13 @@ static inline bool pfn_valid(unsigned long pfn) */ #define __va(x) \ ({ \ - VIRTUAL_BUG_ON((unsigned long)(x) >= PAGE_OFFSET); \ + VIRTUAL_WARN_ON((unsigned long)(x) >= PAGE_OFFSET); \ (void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET); \ }) #define __pa(x) \ ({ \ - VIRTUAL_BUG_ON((unsigned long)(x) < PAGE_OFFSET); \ + VIRTUAL_WARN_ON((unsigned long)(x) < PAGE_OFFSET); \ (unsigned long)(x) & 0x0fffffffffffffffUL; \ }) -- cgit v1.2.3 From 84ade0a6655bee803d176525ef457175cbf4df22 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 16 May 2022 12:44:22 +0530 Subject: powerpc/ftrace: Remove ftrace init tramp once kernel init is complete Stop using the ftrace trampoline for init section once kernel init is complete. Fixes: 67361cf8071286 ("powerpc/ftrace: Handle large kernel configs") Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220516071422.463738-1-naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/ftrace.h | 4 +++- arch/powerpc/kernel/trace/ftrace.c | 15 ++++++++++++--- arch/powerpc/mm/mem.c | 2 ++ 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index b56166b7ea68..3cee7115441b 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -86,7 +86,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name #endif /* CONFIG_PPC64_ELF_ABI_V1 */ #endif /* CONFIG_FTRACE_SYSCALLS */ -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) && defined(CONFIG_FUNCTION_TRACER) #include static inline void this_cpu_disable_ftrace(void) @@ -110,11 +110,13 @@ static inline u8 this_cpu_get_ftrace_enabled(void) return get_paca()->ftrace_enabled; } +void ftrace_free_init_tramp(void); #else /* CONFIG_PPC64 */ static inline void this_cpu_disable_ftrace(void) { } static inline void this_cpu_enable_ftrace(void) { } static inline void this_cpu_set_ftrace_enabled(u8 ftrace_enabled) { } static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; } +static inline void ftrace_free_init_tramp(void) { } #endif /* CONFIG_PPC64 */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 2807136a1c19..2a893e06e4f1 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -257,9 +257,7 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) /* Is this a known long jump tramp? */ for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_tramps[i]) - break; - else if (ftrace_tramps[i] == tramp) + if (ftrace_tramps[i] == tramp) return 0; /* New trampoline -- read where this goes */ @@ -710,6 +708,17 @@ void arch_ftrace_update_code(int command) extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; +void ftrace_free_init_tramp(void) +{ + int i; + + for (i = 0; i < NUM_FTRACE_TRAMPS && ftrace_tramps[i]; i++) + if (ftrace_tramps[i] == (unsigned long)ftrace_tramp_init) { + ftrace_tramps[i] = 0; + return; + } +} + int __init ftrace_dyn_arch_init(void) { int i; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 177fae5ea0d1..09b7b5e7bb9a 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -314,6 +315,7 @@ void free_initmem(void) mark_initmem_nx(); static_branch_enable(&init_mem_is_free); free_initmem_default(POISON_FREE_INITMEM); + ftrace_free_init_tramp(); } /* -- cgit v1.2.3 From 5352090a999570c6e8a701bcb755fd91e8c5a2cd Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 18 May 2022 20:06:17 +1000 Subject: powerpc/kasan: Don't instrument non-maskable or raw interrupts Disable address sanitization for raw and non-maskable interrupt handlers, because they can run in real mode, where we cannot access the shadow memory. (Note that kasan_arch_is_ready() doesn't test for real mode, since it is a static branch for speed, and in any case not all the entry points to the generic KASAN code are protected by kasan_arch_is_ready guards.) The changes to interrupt_nmi_enter/exit_prepare() look larger than they actually are. The changes are equivalent to adding !IS_ENABLED(CONFIG_KASAN) to the conditions for calling nmi_enter() or nmi_exit() in real mode. That is, the code is equivalent to using the following condition for calling nmi_enter/exit: if (((!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !firmware_has_feature(FW_FEATURE_LPAR) || radix_enabled()) && !IS_ENABLED(CONFIG_KASAN) || (mfmsr() & MSR_DR)) That unwieldy condition has been split into several statements with comments, for easier reading. The nmi_ipi_lock functions that call atomic functions (i.e., nmi_ipi_lock_start(), nmi_ipi_lock() and nmi_ipi_unlock()), besides being marked noinstr, now call arch_atomic_* functions instead of atomic_* functions because with KASAN enabled, the atomic_* functions are wrappers which explicitly do address sanitization on their arguments. Since we are trying to avoid address sanitization, we have to use the lower-level arch_atomic_* versions. In hv_nmi_check_nonrecoverable(), the regs_set_unrecoverable() call has been open-coded so as to avoid having to either trust the inlining or mark regs_set_unrecoverable() as noinstr. [paulus@ozlabs.org: combined a few work-in-progress commits of Daniel's and wrote the commit message.] Signed-off-by: Daniel Axtens Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/YoTFGaKM8Pd46PIK@cleo --- arch/powerpc/include/asm/interrupt.h | 52 +++++++++++++++++++++++++++--------- arch/powerpc/kernel/smp.c | 22 +++++++-------- arch/powerpc/kernel/traps.c | 6 +++-- arch/powerpc/lib/Makefile | 3 +++ arch/powerpc/platforms/powernv/smp.c | 2 +- 5 files changed, 59 insertions(+), 26 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index f964ef5c57d8..b14f54d789d2 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -324,22 +324,46 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte } #endif + /* If data relocations are enabled, it's safe to use nmi_enter() */ + if (mfmsr() & MSR_DR) { + nmi_enter(); + return; + } + /* - * Do not use nmi_enter() for pseries hash guest taking a real-mode + * But do not use nmi_enter() for pseries hash guest taking a real-mode * NMI because not everything it touches is within the RMA limit. */ - if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || - !firmware_has_feature(FW_FEATURE_LPAR) || - radix_enabled() || (mfmsr() & MSR_DR)) - nmi_enter(); + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && + firmware_has_feature(FW_FEATURE_LPAR) && + !radix_enabled()) + return; + + /* + * Likewise, don't use it if we have some form of instrumentation (like + * KASAN shadow) that is not safe to access in real mode (even on radix) + */ + if (IS_ENABLED(CONFIG_KASAN)) + return; + + /* Otherwise, it should be safe to call it */ + nmi_enter(); } static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state) { - if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || - !firmware_has_feature(FW_FEATURE_LPAR) || - radix_enabled() || (mfmsr() & MSR_DR)) + if (mfmsr() & MSR_DR) { + // nmi_exit if relocations are on nmi_exit(); + } else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && + firmware_has_feature(FW_FEATURE_LPAR) && + !radix_enabled()) { + // no nmi_exit for a pseries hash guest taking a real mode exception + } else if (IS_ENABLED(CONFIG_KASAN)) { + // no nmi_exit for KASAN in real mode + } else { + nmi_exit(); + } /* * nmi does not call nap_adjust_return because nmi should not create @@ -407,7 +431,8 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter * Specific handlers may have additional restrictions. */ #define DEFINE_INTERRUPT_HANDLER_RAW(func) \ -static __always_inline long ____##func(struct pt_regs *regs); \ +static __always_inline __no_sanitize_address __no_kcsan long \ +____##func(struct pt_regs *regs); \ \ interrupt_handler long func(struct pt_regs *regs) \ { \ @@ -421,7 +446,8 @@ interrupt_handler long func(struct pt_regs *regs) \ } \ NOKPROBE_SYMBOL(func); \ \ -static __always_inline long ____##func(struct pt_regs *regs) +static __always_inline __no_sanitize_address __no_kcsan long \ +____##func(struct pt_regs *regs) /** * DECLARE_INTERRUPT_HANDLER - Declare synchronous interrupt handler function @@ -541,7 +567,8 @@ static __always_inline void ____##func(struct pt_regs *regs) * body with a pair of curly brackets. */ #define DEFINE_INTERRUPT_HANDLER_NMI(func) \ -static __always_inline long ____##func(struct pt_regs *regs); \ +static __always_inline __no_sanitize_address __no_kcsan long \ +____##func(struct pt_regs *regs); \ \ interrupt_handler long func(struct pt_regs *regs) \ { \ @@ -558,7 +585,8 @@ interrupt_handler long func(struct pt_regs *regs) \ } \ NOKPROBE_SYMBOL(func); \ \ -static __always_inline long ____##func(struct pt_regs *regs) +static __always_inline __no_sanitize_address __no_kcsan long \ +____##func(struct pt_regs *regs) /* Interrupt handlers */ diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 4c4511b6a75d..4335efcb3184 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -411,32 +411,32 @@ static struct cpumask nmi_ipi_pending_mask; static bool nmi_ipi_busy = false; static void (*nmi_ipi_function)(struct pt_regs *) = NULL; -static void nmi_ipi_lock_start(unsigned long *flags) +noinstr static void nmi_ipi_lock_start(unsigned long *flags) { raw_local_irq_save(*flags); hard_irq_disable(); - while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { + while (arch_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { raw_local_irq_restore(*flags); - spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); + spin_until_cond(arch_atomic_read(&__nmi_ipi_lock) == 0); raw_local_irq_save(*flags); hard_irq_disable(); } } -static void nmi_ipi_lock(void) +noinstr static void nmi_ipi_lock(void) { - while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) - spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); + while (arch_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) + spin_until_cond(arch_atomic_read(&__nmi_ipi_lock) == 0); } -static void nmi_ipi_unlock(void) +noinstr static void nmi_ipi_unlock(void) { smp_mb(); - WARN_ON(atomic_read(&__nmi_ipi_lock) != 1); - atomic_set(&__nmi_ipi_lock, 0); + WARN_ON(arch_atomic_read(&__nmi_ipi_lock) != 1); + arch_atomic_set(&__nmi_ipi_lock, 0); } -static void nmi_ipi_unlock_end(unsigned long *flags) +noinstr static void nmi_ipi_unlock_end(unsigned long *flags) { nmi_ipi_unlock(); raw_local_irq_restore(*flags); @@ -445,7 +445,7 @@ static void nmi_ipi_unlock_end(unsigned long *flags) /* * Platform NMI handler calls this to ack */ -int smp_handle_nmi_ipi(struct pt_regs *regs) +noinstr int smp_handle_nmi_ipi(struct pt_regs *regs) { void (*fn)(struct pt_regs *) = NULL; unsigned long flags; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index a08bb7cefdc5..3aaa50e5c72f 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -393,7 +393,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) * Builds that do not support KVM could take this second option to increase * the recoverability of NMIs. */ -void hv_nmi_check_nonrecoverable(struct pt_regs *regs) +noinstr void hv_nmi_check_nonrecoverable(struct pt_regs *regs) { #ifdef CONFIG_PPC_POWERNV unsigned long kbase = (unsigned long)_stext; @@ -433,7 +433,9 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) return; nonrecoverable: - regs_set_unrecoverable(regs); + regs->msr &= ~MSR_RI; + local_paca->hsrr_valid = 0; + local_paca->srr_valid = 0; #endif } DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 5d1881d2e39a..8560c912186d 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -13,6 +13,9 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) KASAN_SANITIZE_code-patching.o := n KASAN_SANITIZE_feature-fixups.o := n +# restart_table.o contains functions called in the NMI interrupt path +# which can be in real mode. Disable KASAN. +KASAN_SANITIZE_restart_table.o := n ifdef CONFIG_KASAN CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index cbb67813cd5d..9e1a25398f98 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -345,7 +345,7 @@ static void __init pnv_smp_probe(void) } } -static int pnv_system_reset_exception(struct pt_regs *regs) +noinstr static int pnv_system_reset_exception(struct pt_regs *regs) { if (smp_handle_nmi_ipi(regs)) return 1; -- cgit v1.2.3 From 41b7a347bf1491e7300563bb224432608b41f62a Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 18 May 2022 20:05:31 +1000 Subject: powerpc: Book3S 64-bit outline-only KASAN support Implement a limited form of KASAN for Book3S 64-bit machines running under the Radix MMU, supporting only outline mode. - Enable the compiler instrumentation to check addresses and maintain the shadow region. (This is the guts of KASAN which we can easily reuse.) - Require kasan-vmalloc support to handle modules and anything else in vmalloc space. - KASAN needs to be able to validate all pointer accesses, but we can't instrument all kernel addresses - only linear map and vmalloc. On boot, set up a single page of read-only shadow that marks all iomap and vmemmap accesses as valid. - Document KASAN in powerpc docs. Background ---------- KASAN support on Book3S is a bit tricky to get right: - It would be good to support inline instrumentation so as to be able to catch stack issues that cannot be caught with outline mode. - Inline instrumentation requires a fixed offset. - Book3S runs code with translations off ("real mode") during boot, including a lot of generic device-tree parsing code which is used to determine MMU features. [ppc64 mm note: The kernel installs a linear mapping at effective address c000...-c008.... This is a one-to-one mapping with physical memory from 0000... onward. Because of how memory accesses work on powerpc 64-bit Book3S, a kernel pointer in the linear map accesses the same memory both with translations on (accessing as an 'effective address'), and with translations off (accessing as a 'real address'). This works in both guests and the hypervisor. For more details, see s5.7 of Book III of version 3 of the ISA, in particular the Storage Control Overview, s5.7.3, and s5.7.5 - noting that this KASAN implementation currently only supports Radix.] - Some code - most notably a lot of KVM code - also runs with translations off after boot. - Therefore any offset has to point to memory that is valid with translations on or off. One approach is just to give up on inline instrumentation. This way boot-time checks can be delayed until after the MMU is set is up, and we can just not instrument any code that runs with translations off after booting. Take this approach for now and require outline instrumentation. Previous attempts allowed inline instrumentation. However, they came with some unfortunate restrictions: only physically contiguous memory could be used and it had to be specified at compile time. Maybe we can do better in the future. [paulus@ozlabs.org - Rebased onto 5.17. Note that a kernel with CONFIG_KASAN=y will crash during boot on a machine using HPT translation because not all the entry points to the generic KASAN code are protected with a call to kasan_arch_is_ready().] Originally-by: Balbir Singh # ppc64 out-of-line radix version Signed-off-by: Daniel Axtens Signed-off-by: Paul Mackerras [mpe: Update copyright year and comment formatting] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/YoTE69OQwiG7z+Gu@cleo --- Documentation/powerpc/kasan.txt | 48 ++++++++++++- arch/powerpc/Kconfig | 5 +- arch/powerpc/Kconfig.debug | 3 +- arch/powerpc/include/asm/book3s/64/hash.h | 4 ++ arch/powerpc/include/asm/book3s/64/pgtable.h | 3 + arch/powerpc/include/asm/book3s/64/radix.h | 12 +++- arch/powerpc/include/asm/kasan.h | 22 ++++++ arch/powerpc/kernel/Makefile | 11 +++ arch/powerpc/kvm/Makefile | 5 ++ arch/powerpc/mm/book3s64/Makefile | 9 +++ arch/powerpc/mm/kasan/Makefile | 1 + arch/powerpc/mm/kasan/init_book3s_64.c | 102 +++++++++++++++++++++++++++ arch/powerpc/mm/ptdump/ptdump.c | 3 +- arch/powerpc/platforms/Kconfig.cputype | 1 + arch/powerpc/platforms/powernv/Makefile | 8 +++ arch/powerpc/platforms/pseries/Makefile | 2 + 16 files changed, 232 insertions(+), 7 deletions(-) create mode 100644 arch/powerpc/mm/kasan/init_book3s_64.c (limited to 'arch/powerpc/include') diff --git a/Documentation/powerpc/kasan.txt b/Documentation/powerpc/kasan.txt index 26bb0e8bb18c..f032b4eaf205 100644 --- a/Documentation/powerpc/kasan.txt +++ b/Documentation/powerpc/kasan.txt @@ -1,4 +1,4 @@ -KASAN is supported on powerpc on 32-bit only. +KASAN is supported on powerpc on 32-bit and Radix 64-bit only. 32 bit support ============== @@ -10,3 +10,49 @@ fixmap area and occupies one eighth of the total kernel virtual memory space. Instrumentation of the vmalloc area is optional, unless built with modules, in which case it is required. + +64 bit support +============== + +Currently, only the radix MMU is supported. There have been versions for hash +and Book3E processors floating around on the mailing list, but nothing has been +merged. + +KASAN support on Book3S is a bit tricky to get right: + + - It would be good to support inline instrumentation so as to be able to catch + stack issues that cannot be caught with outline mode. + + - Inline instrumentation requires a fixed offset. + + - Book3S runs code with translations off ("real mode") during boot, including a + lot of generic device-tree parsing code which is used to determine MMU + features. + + - Some code - most notably a lot of KVM code - also runs with translations off + after boot. + + - Therefore any offset has to point to memory that is valid with + translations on or off. + +One approach is just to give up on inline instrumentation. This way boot-time +checks can be delayed until after the MMU is set is up, and we can just not +instrument any code that runs with translations off after booting. This is the +current approach. + +To avoid this limitiation, the KASAN shadow would have to be placed inside the +linear mapping, using the same high-bits trick we use for the rest of the linear +mapping. This is tricky: + + - We'd like to place it near the start of physical memory. In theory we can do + this at run-time based on how much physical memory we have, but this requires + being able to arbitrarily relocate the kernel, which is basically the tricky + part of KASLR. Not being game to implement both tricky things at once, this + is hopefully something we can revisit once we get KASLR for Book3S. + + - Alternatively, we can place the shadow at the _end_ of memory, but this + requires knowing how much contiguous physical memory a system has _at compile + time_. This is a big hammer, and has some unfortunate consequences: inablity + to handle discontiguous physical memory, total failure to boot on machines + with less memory than specified, and that machines with more memory than + specified can't use it. This was deemed unacceptable. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index df202cb6a841..f5ed355e75e6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -109,6 +109,7 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 + select ARCH_DISABLE_KASAN_INLINE if PPC_RADIX_MMU select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_COPY_MC if PPC64 @@ -157,6 +158,7 @@ config PPC select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANTS_NO_INSTR select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_TABLE_SORT @@ -188,7 +190,8 @@ config PPC select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if PPC32 && PPC_PAGE_SHIFT <= 14 - select HAVE_ARCH_KASAN_VMALLOC if PPC32 && PPC_PAGE_SHIFT <= 14 + select HAVE_ARCH_KASAN if PPC_RADIX_MMU + select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN select HAVE_ARCH_KFENCE if PPC_BOOK3S_32 || PPC_8xx || 40x select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 192f0ed0097f..9f363c143d86 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -374,4 +374,5 @@ config PPC_FAST_ENDIAN_SWITCH config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0xe0000000 + default 0xe0000000 if PPC32 + default 0xa80e000000000000 if PPC64 diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index a7a0572f3846..17e7a778c856 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -18,6 +18,10 @@ #include #endif +#define H_PTRS_PER_PTE (1 << H_PTE_INDEX_SIZE) +#define H_PTRS_PER_PMD (1 << H_PMD_INDEX_SIZE) +#define H_PTRS_PER_PUD (1 << H_PUD_INDEX_SIZE) + /* Bits to set in a PMD/PUD/PGD entry valid bit*/ #define HASH_PMD_VAL_BITS (0x8000000000000000UL) #define HASH_PUD_VAL_BITS (0x8000000000000000UL) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 875730d5af40..010eb373fcb3 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -232,6 +232,9 @@ extern unsigned long __pmd_frag_size_shift; #define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) +#define MAX_PTRS_PER_PTE ((H_PTRS_PER_PTE > R_PTRS_PER_PTE) ? H_PTRS_PER_PTE : R_PTRS_PER_PTE) +#define MAX_PTRS_PER_PMD ((H_PTRS_PER_PMD > R_PTRS_PER_PMD) ? H_PTRS_PER_PMD : R_PTRS_PER_PMD) +#define MAX_PTRS_PER_PUD ((H_PTRS_PER_PUD > R_PTRS_PER_PUD) ? H_PTRS_PER_PUD : R_PTRS_PER_PUD) #define MAX_PTRS_PER_PGD (1 << (H_PGD_INDEX_SIZE > RADIX_PGD_INDEX_SIZE ? \ H_PGD_INDEX_SIZE : RADIX_PGD_INDEX_SIZE)) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index d090d9612348..686001eda936 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -35,6 +35,11 @@ #define RADIX_PMD_SHIFT (PAGE_SHIFT + RADIX_PTE_INDEX_SIZE) #define RADIX_PUD_SHIFT (RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE) #define RADIX_PGD_SHIFT (RADIX_PUD_SHIFT + RADIX_PUD_INDEX_SIZE) + +#define R_PTRS_PER_PTE (1 << RADIX_PTE_INDEX_SIZE) +#define R_PTRS_PER_PMD (1 << RADIX_PMD_INDEX_SIZE) +#define R_PTRS_PER_PUD (1 << RADIX_PUD_INDEX_SIZE) + /* * Size of EA range mapped by our pagetables. */ @@ -68,11 +73,11 @@ * * * 3rd quadrant expanded: - * +------------------------------+ - * | | + * +------------------------------+ Highest address (0xc010000000000000) + * +------------------------------+ KASAN shadow end (0xc00fc00000000000) * | | * | | - * +------------------------------+ Kernel vmemmap end (0xc010000000000000) + * +------------------------------+ Kernel vmemmap end/shadow start (0xc00e000000000000) * | | * | 512TB | * | | @@ -91,6 +96,7 @@ * +------------------------------+ Kernel linear (0xc.....) */ +/* For the sizes of the shadow area, see kasan.h */ /* * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 3c478e5ef24c..a6be4025cba2 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -30,9 +30,31 @@ #define KASAN_SHADOW_OFFSET ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET) +#ifdef CONFIG_PPC32 #define KASAN_SHADOW_END (-(-KASAN_SHADOW_START >> KASAN_SHADOW_SCALE_SHIFT)) +#elif defined(CONFIG_PPC_BOOK3S_64) +/* + * The shadow ends before the highest accessible address + * because we don't need a shadow for the shadow. Instead: + * c00e000000000000 << 3 + a80e000000000000 = c00fc00000000000 + */ +#define KASAN_SHADOW_END 0xc00fc00000000000UL +#endif #ifdef CONFIG_KASAN +#ifdef CONFIG_PPC_BOOK3S_64 +DECLARE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); + +static __always_inline bool kasan_arch_is_ready(void) +{ + if (static_branch_likely(&powerpc_kasan_enabled_key)) + return true; + return false; +} + +#define kasan_arch_is_ready kasan_arch_is_ready +#endif + void kasan_early_init(void); void kasan_mmu_init(void); void kasan_init(void); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 255ba5a3692d..2e2a2a9bcf43 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -33,6 +33,17 @@ KASAN_SANITIZE_early_32.o := n KASAN_SANITIZE_cputable.o := n KASAN_SANITIZE_prom_init.o := n KASAN_SANITIZE_btext.o := n +KASAN_SANITIZE_paca.o := n +KASAN_SANITIZE_setup_64.o := n +KASAN_SANITIZE_mce.o := n +KASAN_SANITIZE_mce_power.o := n + +# we have to be particularly careful in ppc64 to exclude code that +# runs with translations off, as we cannot access the shadow with +# translations off. However, ppc32 can sanitize this. +ifdef CONFIG_PPC64 +KASAN_SANITIZE_traps.o := n +endif ifdef CONFIG_KASAN CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index f17379b0f161..0cd23ce07d68 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -131,3 +131,8 @@ obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o obj-y += $(kvm-book3s_64-builtin-objs-y) + +# KVM does a lot in real-mode, and 64-bit Book3S KASAN doesn't support that +ifdef CONFIG_PPC_BOOK3S_64 +KASAN_SANITIZE := n +endif diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index d527dc8e30a8..cad2abc1730f 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -24,3 +24,12 @@ obj-$(CONFIG_PPC_PKEY) += pkeys.o # Instrumenting the SLB fault path can lead to duplicate SLB entries KCOV_INSTRUMENT_slb.o := n + +# Parts of these can run in real mode and therefore are +# not safe with the current outline KASAN implementation +KASAN_SANITIZE_mmu_context.o := n +KASAN_SANITIZE_pgtable.o := n +KASAN_SANITIZE_radix_pgtable.o := n +KASAN_SANITIZE_radix_tlb.o := n +KASAN_SANITIZE_slb.o := n +KASAN_SANITIZE_pkeys.o := n diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile index bcbfd6f2eca3..4999aadb1867 100644 --- a/arch/powerpc/mm/kasan/Makefile +++ b/arch/powerpc/mm/kasan/Makefile @@ -5,3 +5,4 @@ KASAN_SANITIZE := n obj-$(CONFIG_PPC32) += init_32.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o +obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c new file mode 100644 index 000000000000..0da5566d6b84 --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3S powerpc + * + * Copyright 2019-2022, Daniel Axtens, IBM Corporation. + */ + +/* + * ppc64 turns on virtual memory late in boot, after calling into generic code + * like the device-tree parser, so it uses this in conjunction with a hook in + * outline mode to avoid invalid access early in boot. + */ + +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_init(void) +{ + /* + * We want to do the following things: + * 1) Map real memory into the shadow for all physical memblocks + * This takes us from c000... to c008... + * 2) Leave a hole over the shadow of vmalloc space. KASAN_VMALLOC + * will manage this for us. + * This takes us from c008... to c00a... + * 3) Map the 'early shadow'/zero page over iomap and vmemmap space. + * This takes us up to where we start at c00e... + */ + + void *k_start = kasan_mem_to_shadow((void *)RADIX_VMALLOC_END); + void *k_end = kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END); + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + if (!early_radix_enabled()) { + pr_warn("KASAN not enabled as it requires radix!"); + return; + } + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region((void *)start, (void *)end); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + /* map the early shadow over the iomap and vmemmap space */ + kasan_populate_early_shadow(k_start, k_end); + + /* mark early shadow region as RO and wipe it */ + zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + /* + * clear_page relies on some cache info that hasn't been set up yet. + * It ends up looping ~forever and blows up other data. + * Use memset instead. + */ + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + static_branch_inc(&powerpc_kasan_enabled_key); + + /* Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 8c846982766f..2313053fe679 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -289,11 +290,11 @@ static void populate_markers(void) #endif address_markers[i++].start_address = FIXADDR_START; address_markers[i++].start_address = FIXADDR_TOP; +#endif /* CONFIG_PPC64 */ #ifdef CONFIG_KASAN address_markers[i++].start_address = KASAN_SHADOW_START; address_markers[i++].start_address = KASAN_SHADOW_END; #endif -#endif /* CONFIG_PPC64 */ } static int ptdump_show(struct seq_file *m, void *v) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 942606b2b193..9e2df4b66478 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -104,6 +104,7 @@ config PPC_BOOK3S_64 select HAVE_MOVE_PUD select IRQ_WORK select PPC_64S_HASH_MMU if !PPC_RADIX_MMU + select KASAN_VMALLOC if KASAN config PPC_BOOK3E_64 bool "Embedded processors" diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index dc7b37c23b60..6488b3842199 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,4 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 + +# nothing that deals with real mode is safe to KASAN +# in particular, idle code runs a bunch of things in real mode +KASAN_SANITIZE_idle.o := n +KASAN_SANITIZE_pci-ioda.o := n +# pnv_machine_check_early +KASAN_SANITIZE_setup.o := n + obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index a1fab0955bf4..7aaff5323544 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -34,4 +34,6 @@ obj-$(CONFIG_PPC_VAS) += vas.o vas-sysfs.o obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o +# nothing that operates in real mode is safe for KASAN +KASAN_SANITIZE_ras.o := n KASAN_SANITIZE_kexec.o := n -- cgit v1.2.3 From dc21ed2aef4150fc2fcf58227a4ff24502015c03 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Mar 2022 12:03:06 +0200 Subject: powerpc/85xx: Remove FSL_85XX_CACHE_SRAM CONFIG_FSL_85XX_CACHE_SRAM is an option that is not user selectable and which is not selected by any driver nor any defconfig. Remove it and all associated code. Signed-off-by: Christophe Leroy Acked-by: Rob Herring Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9949813a6b758903b7bee910f798ba2ca82ff8ee.1648720908.git.christophe.leroy@csgroup.eu --- .../devicetree/bindings/powerpc/fsl/cache_sram.txt | 20 -- arch/powerpc/include/asm/fsl_85xx_cache_sram.h | 35 ---- arch/powerpc/platforms/85xx/Kconfig | 9 - arch/powerpc/sysdev/Makefile | 1 - arch/powerpc/sysdev/fsl_85xx_cache_ctlr.h | 88 --------- arch/powerpc/sysdev/fsl_85xx_cache_sram.c | 147 -------------- arch/powerpc/sysdev/fsl_85xx_l2ctlr.c | 216 --------------------- 7 files changed, 516 deletions(-) delete mode 100644 Documentation/devicetree/bindings/powerpc/fsl/cache_sram.txt delete mode 100644 arch/powerpc/include/asm/fsl_85xx_cache_sram.h delete mode 100644 arch/powerpc/sysdev/fsl_85xx_cache_ctlr.h delete mode 100644 arch/powerpc/sysdev/fsl_85xx_cache_sram.c delete mode 100644 arch/powerpc/sysdev/fsl_85xx_l2ctlr.c (limited to 'arch/powerpc/include') diff --git a/Documentation/devicetree/bindings/powerpc/fsl/cache_sram.txt b/Documentation/devicetree/bindings/powerpc/fsl/cache_sram.txt deleted file mode 100644 index 781955f5217d..000000000000 --- a/Documentation/devicetree/bindings/powerpc/fsl/cache_sram.txt +++ /dev/null @@ -1,20 +0,0 @@ -* Freescale PQ3 and QorIQ based Cache SRAM - -Freescale's mpc85xx and some QorIQ platforms provide an -option of configuring a part of (or full) cache memory -as SRAM. This cache SRAM representation in the device -tree should be done as under:- - -Required properties: - -- compatible : should be "fsl,p2020-cache-sram" -- fsl,cache-sram-ctlr-handle : points to the L2 controller -- reg : offset and length of the cache-sram. - -Example: - -cache-sram@fff00000 { - fsl,cache-sram-ctlr-handle = <&L2>; - reg = <0 0xfff00000 0 0x10000>; - compatible = "fsl,p2020-cache-sram"; -}; diff --git a/arch/powerpc/include/asm/fsl_85xx_cache_sram.h b/arch/powerpc/include/asm/fsl_85xx_cache_sram.h deleted file mode 100644 index 0235a0447baa..000000000000 --- a/arch/powerpc/include/asm/fsl_85xx_cache_sram.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2009 Freescale Semiconductor, Inc. - * - * Cache SRAM handling for QorIQ platform - * - * Author: Vivek Mahajan - - * This file is derived from the original work done - * by Sylvain Munaut for the Bestcomm SRAM allocator. - */ - -#ifndef __ASM_POWERPC_FSL_85XX_CACHE_SRAM_H__ -#define __ASM_POWERPC_FSL_85XX_CACHE_SRAM_H__ - -#include -#include - -/* - * Cache-SRAM - */ - -struct mpc85xx_cache_sram { - phys_addr_t base_phys; - void *base_virt; - unsigned int size; - rh_info_t *rh; - spinlock_t lock; -}; - -extern void mpc85xx_cache_sram_free(void *ptr); -extern void *mpc85xx_cache_sram_alloc(unsigned int size, - phys_addr_t *phys, unsigned int align); - -#endif /* __AMS_POWERPC_FSL_85XX_CACHE_SRAM_H__ */ diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index 4142ebf01382..2be17ffe8714 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -16,15 +16,6 @@ if FSL_SOC_BOOKE if PPC32 -config FSL_85XX_CACHE_SRAM - bool - select PPC_LIB_RHEAP - help - When selected, this option enables cache-sram support - for memory allocation on P1/P2 QorIQ platforms. - cache-sram-size and cache-sram-offset kernel boot - parameters should be passed when this option is enabled. - config BSC9131_RDB bool "Freescale BSC9131RDB" select DEFAULT_UIMAGE diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index 026b3f01a991..9cb1d029511a 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_FSL_PMC) += fsl_pmc.o obj-$(CONFIG_FSL_CORENET_RCPM) += fsl_rcpm.o obj-$(CONFIG_FSL_LBC) += fsl_lbc.o obj-$(CONFIG_FSL_GTM) += fsl_gtm.o -obj-$(CONFIG_FSL_85XX_CACHE_SRAM) += fsl_85xx_l2ctlr.o fsl_85xx_cache_sram.o obj-$(CONFIG_FSL_RIO) += fsl_rio.o fsl_rmu.o obj-$(CONFIG_TSI108_BRIDGE) += tsi108_pci.o tsi108_dev.o obj-$(CONFIG_RTC_DRV_CMOS) += rtc_cmos_setup.o diff --git a/arch/powerpc/sysdev/fsl_85xx_cache_ctlr.h b/arch/powerpc/sysdev/fsl_85xx_cache_ctlr.h deleted file mode 100644 index ce370749add9..000000000000 --- a/arch/powerpc/sysdev/fsl_85xx_cache_ctlr.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2009-2010, 2012 Freescale Semiconductor, Inc - * - * QorIQ based Cache Controller Memory Mapped Registers - * - * Author: Vivek Mahajan - */ - -#ifndef __FSL_85XX_CACHE_CTLR_H__ -#define __FSL_85XX_CACHE_CTLR_H__ - -#define L2CR_L2FI 0x40000000 /* L2 flash invalidate */ -#define L2CR_L2IO 0x00200000 /* L2 instruction only */ -#define L2CR_SRAM_ZERO 0x00000000 /* L2SRAM zero size */ -#define L2CR_SRAM_FULL 0x00010000 /* L2SRAM full size */ -#define L2CR_SRAM_HALF 0x00020000 /* L2SRAM half size */ -#define L2CR_SRAM_TWO_HALFS 0x00030000 /* L2SRAM two half sizes */ -#define L2CR_SRAM_QUART 0x00040000 /* L2SRAM one quarter size */ -#define L2CR_SRAM_TWO_QUARTS 0x00050000 /* L2SRAM two quarter size */ -#define L2CR_SRAM_EIGHTH 0x00060000 /* L2SRAM one eighth size */ -#define L2CR_SRAM_TWO_EIGHTH 0x00070000 /* L2SRAM two eighth size */ - -#define L2SRAM_OPTIMAL_SZ_SHIFT 0x00000003 /* Optimum size for L2SRAM */ - -#define L2SRAM_BAR_MSK_LO18 0xFFFFC000 /* Lower 18 bits */ -#define L2SRAM_BARE_MSK_HI4 0x0000000F /* Upper 4 bits */ - -enum cache_sram_lock_ways { - LOCK_WAYS_ZERO, - LOCK_WAYS_EIGHTH, - LOCK_WAYS_TWO_EIGHTH, - LOCK_WAYS_HALF = 4, - LOCK_WAYS_FULL = 8, -}; - -struct mpc85xx_l2ctlr { - u32 ctl; /* 0x000 - L2 control */ - u8 res1[0xC]; - u32 ewar0; /* 0x010 - External write address 0 */ - u32 ewarea0; /* 0x014 - External write address extended 0 */ - u32 ewcr0; /* 0x018 - External write ctrl */ - u8 res2[4]; - u32 ewar1; /* 0x020 - External write address 1 */ - u32 ewarea1; /* 0x024 - External write address extended 1 */ - u32 ewcr1; /* 0x028 - External write ctrl 1 */ - u8 res3[4]; - u32 ewar2; /* 0x030 - External write address 2 */ - u32 ewarea2; /* 0x034 - External write address extended 2 */ - u32 ewcr2; /* 0x038 - External write ctrl 2 */ - u8 res4[4]; - u32 ewar3; /* 0x040 - External write address 3 */ - u32 ewarea3; /* 0x044 - External write address extended 3 */ - u32 ewcr3; /* 0x048 - External write ctrl 3 */ - u8 res5[0xB4]; - u32 srbar0; /* 0x100 - SRAM base address 0 */ - u32 srbarea0; /* 0x104 - SRAM base addr reg ext address 0 */ - u32 srbar1; /* 0x108 - SRAM base address 1 */ - u32 srbarea1; /* 0x10C - SRAM base addr reg ext address 1 */ - u8 res6[0xCF0]; - u32 errinjhi; /* 0xE00 - Error injection mask high */ - u32 errinjlo; /* 0xE04 - Error injection mask low */ - u32 errinjctl; /* 0xE08 - Error injection tag/ecc control */ - u8 res7[0x14]; - u32 captdatahi; /* 0xE20 - Error data high capture */ - u32 captdatalo; /* 0xE24 - Error data low capture */ - u32 captecc; /* 0xE28 - Error syndrome */ - u8 res8[0x14]; - u32 errdet; /* 0xE40 - Error detect */ - u32 errdis; /* 0xE44 - Error disable */ - u32 errinten; /* 0xE48 - Error interrupt enable */ - u32 errattr; /* 0xE4c - Error attribute capture */ - u32 erradrrl; /* 0xE50 - Error address capture low */ - u32 erradrrh; /* 0xE54 - Error address capture high */ - u32 errctl; /* 0xE58 - Error control */ - u8 res9[0x1A4]; -}; - -struct sram_parameters { - unsigned int sram_size; - phys_addr_t sram_offset; -}; - -extern int instantiate_cache_sram(struct platform_device *dev, - struct sram_parameters sram_params); -extern void remove_cache_sram(struct platform_device *dev); - -#endif /* __FSL_85XX_CACHE_CTLR_H__ */ diff --git a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c deleted file mode 100644 index a3aeaa5f0f1b..000000000000 --- a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2009-2010 Freescale Semiconductor, Inc. - * - * Simple memory allocator abstraction for QorIQ (P1/P2) based Cache-SRAM - * - * Author: Vivek Mahajan - * - * This file is derived from the original work done - * by Sylvain Munaut for the Bestcomm SRAM allocator. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "fsl_85xx_cache_ctlr.h" - -struct mpc85xx_cache_sram *cache_sram; - -void *mpc85xx_cache_sram_alloc(unsigned int size, - phys_addr_t *phys, unsigned int align) -{ - unsigned long offset; - unsigned long flags; - - if (unlikely(cache_sram == NULL)) - return NULL; - - if (!size || (size > cache_sram->size) || (align > cache_sram->size)) { - pr_err("%s(): size(=%x) or align(=%x) zero or too big\n", - __func__, size, align); - return NULL; - } - - if ((align & (align - 1)) || align <= 1) { - pr_err("%s(): align(=%x) must be power of two and >1\n", - __func__, align); - return NULL; - } - - spin_lock_irqsave(&cache_sram->lock, flags); - offset = rh_alloc_align(cache_sram->rh, size, align, NULL); - spin_unlock_irqrestore(&cache_sram->lock, flags); - - if (IS_ERR_VALUE(offset)) - return NULL; - - *phys = cache_sram->base_phys + offset; - - return (unsigned char *)cache_sram->base_virt + offset; -} -EXPORT_SYMBOL(mpc85xx_cache_sram_alloc); - -void mpc85xx_cache_sram_free(void *ptr) -{ - unsigned long flags; - BUG_ON(!ptr); - - spin_lock_irqsave(&cache_sram->lock, flags); - rh_free(cache_sram->rh, ptr - cache_sram->base_virt); - spin_unlock_irqrestore(&cache_sram->lock, flags); -} -EXPORT_SYMBOL(mpc85xx_cache_sram_free); - -int __init instantiate_cache_sram(struct platform_device *dev, - struct sram_parameters sram_params) -{ - int ret = 0; - - if (cache_sram) { - dev_err(&dev->dev, "Already initialized cache-sram\n"); - return -EBUSY; - } - - cache_sram = kzalloc(sizeof(struct mpc85xx_cache_sram), GFP_KERNEL); - if (!cache_sram) { - dev_err(&dev->dev, "Out of memory for cache_sram structure\n"); - return -ENOMEM; - } - - cache_sram->base_phys = sram_params.sram_offset; - cache_sram->size = sram_params.sram_size; - - if (!request_mem_region(cache_sram->base_phys, cache_sram->size, - "fsl_85xx_cache_sram")) { - dev_err(&dev->dev, "%pOF: request memory failed\n", - dev->dev.of_node); - ret = -ENXIO; - goto out_free; - } - - cache_sram->base_virt = ioremap_coherent(cache_sram->base_phys, - cache_sram->size); - if (!cache_sram->base_virt) { - dev_err(&dev->dev, "%pOF: ioremap_coherent failed\n", - dev->dev.of_node); - ret = -ENOMEM; - goto out_release; - } - - cache_sram->rh = rh_create(sizeof(unsigned int)); - if (IS_ERR(cache_sram->rh)) { - dev_err(&dev->dev, "%pOF: Unable to create remote heap\n", - dev->dev.of_node); - ret = PTR_ERR(cache_sram->rh); - goto out_unmap; - } - - rh_attach_region(cache_sram->rh, 0, cache_sram->size); - spin_lock_init(&cache_sram->lock); - - dev_info(&dev->dev, "[base:0x%llx, size:0x%x] configured and loaded\n", - (unsigned long long)cache_sram->base_phys, cache_sram->size); - - return 0; - -out_unmap: - iounmap(cache_sram->base_virt); - -out_release: - release_mem_region(cache_sram->base_phys, cache_sram->size); - -out_free: - kfree(cache_sram); - return ret; -} - -void remove_cache_sram(struct platform_device *dev) -{ - BUG_ON(!cache_sram); - - rh_detach_region(cache_sram->rh, 0, cache_sram->size); - rh_destroy(cache_sram->rh); - - iounmap(cache_sram->base_virt); - release_mem_region(cache_sram->base_phys, cache_sram->size); - - kfree(cache_sram); - cache_sram = NULL; - - dev_info(&dev->dev, "MPC85xx Cache-SRAM driver unloaded\n"); -} diff --git a/arch/powerpc/sysdev/fsl_85xx_l2ctlr.c b/arch/powerpc/sysdev/fsl_85xx_l2ctlr.c deleted file mode 100644 index 2d0af0c517bb..000000000000 --- a/arch/powerpc/sysdev/fsl_85xx_l2ctlr.c +++ /dev/null @@ -1,216 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2009-2010, 2012 Freescale Semiconductor, Inc. - * - * QorIQ (P1/P2) L2 controller init for Cache-SRAM instantiation - * - * Author: Vivek Mahajan - */ - -#include -#include -#include -#include - -#include "fsl_85xx_cache_ctlr.h" - -static char *sram_size; -static char *sram_offset; -struct mpc85xx_l2ctlr __iomem *l2ctlr; - -static int get_cache_sram_params(struct sram_parameters *sram_params) -{ - unsigned long long addr; - unsigned int size; - - if (!sram_size || (kstrtouint(sram_size, 0, &size) < 0)) - return -EINVAL; - - if (!sram_offset || (kstrtoull(sram_offset, 0, &addr) < 0)) - return -EINVAL; - - sram_params->sram_offset = addr; - sram_params->sram_size = size; - - return 0; -} - -static int __init get_size_from_cmdline(char *str) -{ - if (!str) - return 0; - - sram_size = str; - return 1; -} - -static int __init get_offset_from_cmdline(char *str) -{ - if (!str) - return 0; - - sram_offset = str; - return 1; -} - -__setup("cache-sram-size=", get_size_from_cmdline); -__setup("cache-sram-offset=", get_offset_from_cmdline); - -static int mpc85xx_l2ctlr_of_probe(struct platform_device *dev) -{ - long rval; - unsigned int rem; - unsigned char ways; - const unsigned int *prop; - unsigned int l2cache_size; - struct sram_parameters sram_params; - - if (!dev->dev.of_node) { - dev_err(&dev->dev, "Device's OF-node is NULL\n"); - return -EINVAL; - } - - prop = of_get_property(dev->dev.of_node, "cache-size", NULL); - if (!prop) { - dev_err(&dev->dev, "Missing L2 cache-size\n"); - return -EINVAL; - } - l2cache_size = *prop; - - if (get_cache_sram_params(&sram_params)) - return 0; /* fall back to L2 cache only */ - - rem = l2cache_size % sram_params.sram_size; - ways = LOCK_WAYS_FULL * sram_params.sram_size / l2cache_size; - if (rem || (ways & (ways - 1))) { - dev_err(&dev->dev, "Illegal cache-sram-size in command line\n"); - return -EINVAL; - } - - l2ctlr = of_iomap(dev->dev.of_node, 0); - if (!l2ctlr) { - dev_err(&dev->dev, "Can't map L2 controller\n"); - return -EINVAL; - } - - /* - * Write bits[0-17] to srbar0 - */ - out_be32(&l2ctlr->srbar0, - lower_32_bits(sram_params.sram_offset) & L2SRAM_BAR_MSK_LO18); - - /* - * Write bits[18-21] to srbare0 - */ -#ifdef CONFIG_PHYS_64BIT - out_be32(&l2ctlr->srbarea0, - upper_32_bits(sram_params.sram_offset) & L2SRAM_BARE_MSK_HI4); -#endif - - clrsetbits_be32(&l2ctlr->ctl, L2CR_L2E, L2CR_L2FI); - - switch (ways) { - case LOCK_WAYS_EIGHTH: - setbits32(&l2ctlr->ctl, - L2CR_L2E | L2CR_L2FI | L2CR_SRAM_EIGHTH); - break; - - case LOCK_WAYS_TWO_EIGHTH: - setbits32(&l2ctlr->ctl, - L2CR_L2E | L2CR_L2FI | L2CR_SRAM_QUART); - break; - - case LOCK_WAYS_HALF: - setbits32(&l2ctlr->ctl, - L2CR_L2E | L2CR_L2FI | L2CR_SRAM_HALF); - break; - - case LOCK_WAYS_FULL: - default: - setbits32(&l2ctlr->ctl, - L2CR_L2E | L2CR_L2FI | L2CR_SRAM_FULL); - break; - } - eieio(); - - rval = instantiate_cache_sram(dev, sram_params); - if (rval < 0) { - dev_err(&dev->dev, "Can't instantiate Cache-SRAM\n"); - iounmap(l2ctlr); - return -EINVAL; - } - - return 0; -} - -static int mpc85xx_l2ctlr_of_remove(struct platform_device *dev) -{ - BUG_ON(!l2ctlr); - - iounmap(l2ctlr); - remove_cache_sram(dev); - dev_info(&dev->dev, "MPC85xx L2 controller unloaded\n"); - - return 0; -} - -static const struct of_device_id mpc85xx_l2ctlr_of_match[] = { - { - .compatible = "fsl,p2020-l2-cache-controller", - }, - { - .compatible = "fsl,p2010-l2-cache-controller", - }, - { - .compatible = "fsl,p1020-l2-cache-controller", - }, - { - .compatible = "fsl,p1011-l2-cache-controller", - }, - { - .compatible = "fsl,p1013-l2-cache-controller", - }, - { - .compatible = "fsl,p1022-l2-cache-controller", - }, - { - .compatible = "fsl,mpc8548-l2-cache-controller", - }, - { .compatible = "fsl,mpc8544-l2-cache-controller",}, - { .compatible = "fsl,mpc8572-l2-cache-controller",}, - { .compatible = "fsl,mpc8536-l2-cache-controller",}, - { .compatible = "fsl,p1021-l2-cache-controller",}, - { .compatible = "fsl,p1012-l2-cache-controller",}, - { .compatible = "fsl,p1025-l2-cache-controller",}, - { .compatible = "fsl,p1016-l2-cache-controller",}, - { .compatible = "fsl,p1024-l2-cache-controller",}, - { .compatible = "fsl,p1015-l2-cache-controller",}, - { .compatible = "fsl,p1010-l2-cache-controller",}, - { .compatible = "fsl,bsc9131-l2-cache-controller",}, - {}, -}; - -static struct platform_driver mpc85xx_l2ctlr_of_platform_driver = { - .driver = { - .name = "fsl-l2ctlr", - .of_match_table = mpc85xx_l2ctlr_of_match, - }, - .probe = mpc85xx_l2ctlr_of_probe, - .remove = mpc85xx_l2ctlr_of_remove, -}; - -static __init int mpc85xx_l2ctlr_of_init(void) -{ - return platform_driver_register(&mpc85xx_l2ctlr_of_platform_driver); -} - -static void __exit mpc85xx_l2ctlr_of_exit(void) -{ - platform_driver_unregister(&mpc85xx_l2ctlr_of_platform_driver); -} - -subsys_initcall(mpc85xx_l2ctlr_of_init); -module_exit(mpc85xx_l2ctlr_of_exit); - -MODULE_DESCRIPTION("Freescale MPC85xx L2 controller init"); -MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 48b63961c84671df4c4a4451c40057e34b26df1a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 11 Apr 2022 09:49:34 +0200 Subject: powerpc/numa: Associate numa node to its cpu earlier powerpc is the only platform that do not rely on cpu_up()->try_online_node() to bring up a numa node, and special cases it, instead, deep in its own machinery: dlpar_online_cpu find_and_online_cpu_nid try_online_node This should not be needed, but the thing is that the try_online_node() from cpu_up() will not apply on the right node, because cpu_to_node() will return the old mapping numa<->cpu that gets set on boot stage for all possible cpus. That can be seen easily if we try to print out the numa node passed to try_online_node() in cpu_up(). The thing is that the numa<->cpu mapping does not get updated till a much later stage in start_secondary: start_secondary: set_numa_node(numa_cpu_lookup_table[cpu]) But we do not really care, as we already now the CPU <-> NUMA associativity back in find_and_online_cpu_nid(), so let us make use of that and set the proper numa<->cpu mapping, so cpu_to_node() in cpu_up() returns the right node and try_online_node() can do its work. Signed-off-by: Oscar Salvador Tested-by: Geetika Moolchandani Reviewed-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220411074934.4632-1-osalvador@suse.de --- arch/powerpc/include/asm/topology.h | 8 ++----- arch/powerpc/mm/numa.c | 32 +++++++--------------------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 +- 3 files changed, 11 insertions(+), 31 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 36fcafb1fd6d..8a4d4f4d9749 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -111,14 +111,10 @@ static inline void unmap_cpu_from_node(unsigned long cpu) {} #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) -extern int find_and_online_cpu_nid(int cpu); +void find_and_update_cpu_nid(int cpu); extern int cpu_to_coregroup_id(int cpu); #else -static inline int find_and_online_cpu_nid(int cpu) -{ - return 0; -} - +static inline void find_and_update_cpu_nid(int cpu) {} static inline int cpu_to_coregroup_id(int cpu) { #ifdef CONFIG_SMP diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 4680e310e68a..0801b2ce9b7d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1422,42 +1422,26 @@ out: return rc; } -int find_and_online_cpu_nid(int cpu) +void find_and_update_cpu_nid(int cpu) { __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; int new_nid; /* Use associativity from first thread for all siblings */ if (vphn_get_associativity(cpu, associativity)) - return cpu_to_node(cpu); + return; + /* Do not have previous associativity, so find it now. */ new_nid = associativity_to_nid(associativity); - if (new_nid < 0 || !node_possible(new_nid)) - new_nid = first_online_node; - if (!node_online(new_nid)) { -#ifdef CONFIG_MEMORY_HOTPLUG - /* - * Need to ensure that NODE_DATA is initialized for a node from - * available memory (see memblock_alloc_try_nid). If unable to - * init the node, then default to nearest node that has memory - * installed. Skip onlining a node if the subsystems are not - * yet initialized. - */ - if (!topology_inited || try_online_node(new_nid)) - new_nid = first_online_node; -#else - /* - * Default to using the nearest node that has memory installed. - * Otherwise, it would be necessary to patch the kernel MM code - * to deal with more memoryless-node error conditions. - */ + if (new_nid < 0 || !node_possible(new_nid)) new_nid = first_online_node; -#endif - } + else + // Associate node <-> cpu, so cpu_up() calls + // try_online_node() on the right node. + set_cpu_numa_node(cpu, new_nid); pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid); - return new_nid; } int cpu_to_coregroup_id(int cpu) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index b81fc846d99c..0f8cd8b06432 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -398,7 +398,7 @@ static int dlpar_online_cpu(struct device_node *dn) if (get_hard_smp_processor_id(cpu) != thread) continue; cpu_maps_update_done(); - find_and_online_cpu_nid(cpu); + find_and_update_cpu_nid(cpu); rc = device_online(get_cpu_device(cpu)); if (rc) { dlpar_offline_cpu(dn); -- cgit v1.2.3 From 3e36960a27fec30f16bace1521dc852105522f5e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 May 2022 14:30:56 +1000 Subject: powerpc/64s: Add CPU_FTRS_POWER9_DD2_2 to CPU_FTRS_ALWAYS mask CPU_FTRS_POWER9_DD2_2 is missing from CPU_FTRS_ALWAYS. That doesn't cause any bug, because CPU_FTRS_POWER9_DD2_2 adds new bits that don't appear in other values, so when anded with the other masks the result is the same. But for consistency we should have all values in the CPU_FTRS_ALWAYS mask, so that the logic is robust against the values being changed in future. Fixes: b5af4f279323 ("powerpc: Add CPU feature bits for TM bug workarounds on POWER9 v2.2") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220519122205.746276-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/cputable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index e85c849214a2..4457052d7450 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -541,14 +541,14 @@ enum { #define CPU_FTRS_ALWAYS \ (CPU_FTRS_POSSIBLE & ~CPU_FTR_HVMODE & CPU_FTRS_POWER7 & \ CPU_FTRS_POWER8E & CPU_FTRS_POWER8 & CPU_FTRS_POWER9 & \ - CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_DT_CPU_BASE) + CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & CPU_FTRS_DT_CPU_BASE) #else #define CPU_FTRS_ALWAYS \ (CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & \ CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \ CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \ ~CPU_FTR_HVMODE & CPU_FTRS_POSSIBLE & CPU_FTRS_POWER9 & \ - CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_DT_CPU_BASE) + CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & CPU_FTRS_DT_CPU_BASE) #endif /* CONFIG_CPU_LITTLE_ENDIAN */ #endif #else -- cgit v1.2.3 From b4d9cc75721bdbceba0d71aa5accf6aa09ee26c1 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 May 2022 15:03:57 +1000 Subject: powerpc/64s: Add CPU_FTRS_POWER10 to ALWAYS mask CPU_FTRS_POWER10 is missing from the CPU_FTRS_ALWAYS mask. Currently that doesn't cause any bug, because it is a superset of the POWER9 mask, which the exception of CPU_FTR_TM, but POWER7 doesn't have CPU_FTR_TM, so CPU_FTR_TM is not in the ALWAYS mask to begin with. However for consistency, and to be robust against future changes, it should be included in the ALWAYS mask. Fixes: a3ea40d5c736 ("powerpc: Add POWER10 architected mode") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220519122205.746276-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/cputable.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 4457052d7450..48a55cb1bf5c 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -541,14 +541,16 @@ enum { #define CPU_FTRS_ALWAYS \ (CPU_FTRS_POSSIBLE & ~CPU_FTR_HVMODE & CPU_FTRS_POWER7 & \ CPU_FTRS_POWER8E & CPU_FTRS_POWER8 & CPU_FTRS_POWER9 & \ - CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & CPU_FTRS_DT_CPU_BASE) + CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & \ + CPU_FTRS_POWER10 & CPU_FTRS_DT_CPU_BASE) #else #define CPU_FTRS_ALWAYS \ (CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & \ CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \ CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \ ~CPU_FTR_HVMODE & CPU_FTRS_POSSIBLE & CPU_FTRS_POWER9 & \ - CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & CPU_FTRS_DT_CPU_BASE) + CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & \ + CPU_FTRS_POWER10 & CPU_FTRS_DT_CPU_BASE) #endif /* CONFIG_CPU_LITTLE_ENDIAN */ #endif #else -- cgit v1.2.3 From 26b78c81e84c5133b299fb11bf17ec1a3d2ad217 Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Tue, 3 May 2022 12:01:52 -0500 Subject: powerpc: Enable the DAWR on POWER9 DD2.3 and above The hardware bug in POWER9 preventing use of the DAWR was fixed in DD2.3. Set the CPU_FTR_DAWR feature bit on these newer systems to start using it again, and update the documentation accordingly. The CPU features for DD2.3 are currently determined by "DD2.2 or later" logic. In adding DD2.3 as a discrete case for the first time here, I'm carrying the quirks of DD2.2 forward to keep all behavior outside of this DAWR change the same. This leaves the assessment and potential removal of those quirks on DD2.3 for later. Signed-off-by: Reza Arbab Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220503170152.23412-1-arbab@linux.ibm.com --- Documentation/powerpc/dawr-power9.rst | 26 +++++++++++++++++--------- arch/powerpc/include/asm/cputable.h | 10 ++++++++-- arch/powerpc/kernel/cputable.c | 22 ++++++++++++++++++++-- arch/powerpc/kernel/dt_cpu_ftrs.c | 8 +++++++- 4 files changed, 52 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/include') diff --git a/Documentation/powerpc/dawr-power9.rst b/Documentation/powerpc/dawr-power9.rst index e55ac6a24b97..310f2e0cea81 100644 --- a/Documentation/powerpc/dawr-power9.rst +++ b/Documentation/powerpc/dawr-power9.rst @@ -2,15 +2,23 @@ DAWR issues on POWER9 ===================== -On POWER9 the Data Address Watchpoint Register (DAWR) can cause a checkstop -if it points to cache inhibited (CI) memory. Currently Linux has no way to -distinguish CI memory when configuring the DAWR, so (for now) the DAWR is -disabled by this commit:: - - commit 9654153158d3e0684a1bdb76dbababdb7111d5a0 - Author: Michael Neuling - Date: Tue Mar 27 15:37:24 2018 +1100 - powerpc: Disable DAWR in the base POWER9 CPU features +On older POWER9 processors, the Data Address Watchpoint Register (DAWR) can +cause a checkstop if it points to cache inhibited (CI) memory. Currently Linux +has no way to distinguish CI memory when configuring the DAWR, so on affected +systems, the DAWR is disabled. + +Affected processor revisions +============================ + +This issue is only present on processors prior to v2.3. The revision can be +found in /proc/cpuinfo:: + + processor : 0 + cpu : POWER9, altivec supported + clock : 3800.000000MHz + revision : 2.3 (pvr 004e 1203) + +On a system with the issue, the DAWR is disabled as detailed below. Technical Details: ================== diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 48a55cb1bf5c..549eb6dd146f 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -440,6 +440,10 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ CPU_FTR_P9_TM_HV_ASSIST | \ CPU_FTR_P9_TM_XER_SO_BUG) +#define CPU_FTRS_POWER9_DD2_3 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ + CPU_FTR_P9_TM_HV_ASSIST | \ + CPU_FTR_P9_TM_XER_SO_BUG | \ + CPU_FTR_DAWR) #define CPU_FTRS_POWER10 (CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\ CPU_FTR_MMCRA | CPU_FTR_SMT | \ @@ -469,14 +473,16 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTRS_POSSIBLE \ (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \ - CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10) + CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | \ + CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10) #else #define CPU_FTRS_POSSIBLE \ (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \ CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \ CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \ CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \ - CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10) + CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | \ + CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10) #endif /* CONFIG_CPU_LITTLE_ENDIAN */ #endif #else diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index f9b3def5b254..a5dbfccd2047 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -487,11 +487,29 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, - { /* Power9 DD2.2 or later */ + { /* Power9 DD2.2 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0202, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_2, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power9", + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power9 DD2.3 or later */ .pvr_mask = 0xffff0000, .pvr_value = 0x004e0000, .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9_DD2_2, + .cpu_features = CPU_FTRS_POWER9_DD2_3, .cpu_user_features = COMMON_USER_POWER9, .cpu_user_features2 = COMMON_USER2_POWER9, .mmu_features = MMU_FTRS_POWER9, diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index c8e147b66d34..2ad365c21afa 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -774,20 +774,26 @@ static __init void cpufeatures_cpu_quirks(void) if ((version & 0xffffefff) == 0x004e0200) { /* DD2.0 has no feature flag */ cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); } else if ((version & 0xffffefff) == 0x004e0201) { cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); } else if ((version & 0xffffefff) == 0x004e0202) { cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; + cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); + } else if ((version & 0xffffefff) == 0x004e0203) { + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; } else if ((version & 0xffff0000) == 0x004e0000) { /* DD2.1 and up have DD2_1 */ cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; } if ((version & 0xffff0000) == 0x004e0000) { - cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR; } -- cgit v1.2.3 From 5d7c854593a460706dacf8e1b16c9bdcb1c2d7bb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 28 Mar 2022 08:26:48 +0200 Subject: livepatch: Remove klp_arch_set_pc() and asm/livepatch.h All three versions of klp_arch_set_pc() do exactly the same: they call ftrace_instruction_pointer_set(). Call ftrace_instruction_pointer_set() directly and remove klp_arch_set_pc(). As klp_arch_set_pc() was the only thing remaining in asm/livepatch.h on x86 and s390, remove asm/livepatch.h livepatch.h remains on powerpc but its content is exclusively used by powerpc specific code. Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Petr Mladek --- MAINTAINERS | 2 -- arch/powerpc/include/asm/livepatch.h | 10 +--------- arch/powerpc/kernel/irq.c | 1 - arch/powerpc/kernel/setup_64.c | 2 +- arch/s390/include/asm/livepatch.h | 22 ---------------------- arch/x86/include/asm/livepatch.h | 20 -------------------- include/linux/livepatch.h | 2 -- kernel/livepatch/patch.c | 2 +- 8 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 arch/s390/include/asm/livepatch.h delete mode 100644 arch/x86/include/asm/livepatch.h (limited to 'arch/powerpc/include') diff --git a/MAINTAINERS b/MAINTAINERS index 741825cff43c..589c61412e3b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11338,8 +11338,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g F: Documentation/ABI/testing/sysfs-kernel-livepatch F: Documentation/livepatch/ F: arch/powerpc/include/asm/livepatch.h -F: arch/s390/include/asm/livepatch.h -F: arch/x86/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: lib/livepatch/ diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 1c60094ea0cd..d044a1fd4f44 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -7,17 +7,9 @@ #ifndef _ASM_POWERPC_LIVEPATCH_H #define _ASM_POWERPC_LIVEPATCH_H -#include -#include +#include #include -#ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} -#endif /* CONFIG_LIVEPATCH */ - #ifdef CONFIG_LIVEPATCH_64 static inline void klp_init_thread_info(struct task_struct *p) { diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 752fb182eacb..fb99e3fa034d 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e547066a06aa..2c1e9a5ac6ca 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h deleted file mode 100644 index 5209f223331a..000000000000 --- a/arch/s390/include/asm/livepatch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * livepatch.h - s390-specific Kernel Live Patching Core - * - * Copyright (c) 2013-2015 SUSE - * Authors: Jiri Kosina - * Vojtech Pavlik - * Jiri Slaby - */ - -#ifndef ASM_LIVEPATCH_H -#define ASM_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h deleted file mode 100644 index 7c5cc6660e4b..000000000000 --- a/arch/x86/include/asm/livepatch.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * livepatch.h - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings - * Copyright (C) 2014 SUSE - */ - -#ifndef _ASM_X86_LIVEPATCH_H -#define _ASM_X86_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 2614247a9781..293e29960c6e 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -16,8 +16,6 @@ #if IS_ENABLED(CONFIG_LIVEPATCH) -#include - /* task patch states */ #define KLP_UNDEFINED -1 #define KLP_UNPATCHED 0 diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index c172bf92b576..4c4f5a776d80 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(fregs, (unsigned long)func->new_func); + ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); -- cgit v1.2.3 From c85ab4fe33065ca1fdcac26f0c00c837fe727ba7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 24 May 2022 07:42:05 +0200 Subject: powerpc/64s: Only set HAVE_ARCH_UNMAPPED_AREA when CONFIG_PPC_64S_HASH_MMU is set When CONFIG_PPC_64S_HASH_MMU is not set, slice.c is not built and arch_get_unmapped_area() and arch_get_unmapped_area_topdown() are not provided because RADIX uses the generic ones. Therefore, neither set HAVE_ARCH_UNMAPPED_AREA nor HAVE_ARCH_UNMAPPED_AREA_TOPDOWN. Fixes: ab57bd7570d4 ("powerpc/mm: Move get_unmapped_area functions to slice.c") Reported-by: Laurent Dufour Signed-off-by: Christophe Leroy Tested-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e438c6cc09f94085e56733ed2d6e84333c35292a.1653370913.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/slice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index b8eb4ad271b9..5fbe18544cbd 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -4,11 +4,13 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_HUGETLB_PAGE #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN +#endif #define SLICE_LOW_SHIFT 28 #define SLICE_LOW_TOP (0x100000000ul) -- cgit v1.2.3 From dcf280e6f80be280ca7dd1b058f038654e4a18dd Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 27 May 2022 21:15:41 +1000 Subject: powerpc/64: Include cache.h directly in paca.h paca.h uses ____cacheline_aligned without directly including cache.h, where it's defined. For Book3S builds that's OK because paca.h includes lppaca.h, and it does include cache.h. But Book3E builds have been getting cache.h indirectly via printk.h, which is dicey, and in fact that include was recently removed, leading to build errors such as: ld: fs/isofs/dir.o:(.bss+0x0): multiple definition of `____cacheline_aligned'; fs/isofs/namei.o:(.bss+0x0): first defined here So include cache.h directly to fix the build error. Fixes: 534aa1dc975a ("printk: stop including cache.h from printk.h") Reported-by: Guenter Roeck Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/paca.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 03330b7d835f..4d7aaab82702 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -12,6 +12,7 @@ #ifdef CONFIG_PPC64 +#include #include #include #include -- cgit v1.2.3 From 3e8635fb2e072672cbc650989ffedf8300ad67fb Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 2 Jun 2022 00:31:14 +1000 Subject: powerpc/kasan: Force thread size increase with KASAN KASAN causes increased stack usage, which can lead to stack overflows. The logic in Kconfig to suggest a larger default doesn't work if a user has CONFIG_EXPERT enabled and has an existing .config with a smaller value. Follow the lead of x86 and arm64, and force the thread size to be increased when KASAN is enabled. That also has the effect of enlarging the stack for 64-bit KASAN builds, which is also desirable. Fixes: edbadaf06710 ("powerpc/kasan: Fix stack overflow by increasing THREAD_SHIFT") Reported-by: Erhard Furtner Reported-by: Christophe Leroy [mpe: Use MIN_THREAD_SHIFT as suggested by Christophe] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220601143114.133524-1-mpe@ellerman.id.au --- arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/thread_info.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 54dbbb1d4b36..b1760d615bb7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -790,7 +790,6 @@ config THREAD_SHIFT range 13 15 default "15" if PPC_256K_PAGES default "14" if PPC64 - default "14" if KASAN default "13" help Used to define the stack size. The default is almost always what you diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 125328d1b980..af58f1ed3952 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -14,10 +14,16 @@ #ifdef __KERNEL__ -#if defined(CONFIG_VMAP_STACK) && CONFIG_THREAD_SHIFT < PAGE_SHIFT +#ifdef CONFIG_KASAN +#define MIN_THREAD_SHIFT (CONFIG_THREAD_SHIFT + 1) +#else +#define MIN_THREAD_SHIFT CONFIG_THREAD_SHIFT +#endif + +#if defined(CONFIG_VMAP_STACK) && MIN_THREAD_SHIFT < PAGE_SHIFT #define THREAD_SHIFT PAGE_SHIFT #else -#define THREAD_SHIFT CONFIG_THREAD_SHIFT +#define THREAD_SHIFT MIN_THREAD_SHIFT #endif #define THREAD_SIZE (1 << THREAD_SHIFT) -- cgit v1.2.3 From d39e06154024b08a856767a70b7f8215e0af6ace Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 2 Jun 2022 03:19:40 +0900 Subject: powerpc: use __kernel_{uid,gid}32_t in uapi/asm/stat.h Commit c01013a2f8dd ("powerpc: add asm/stat.h to UAPI compile-test coverage") converted as follows: uid_t --> __kernel_uid_t gid_t --> __kernel_gid_t The bit width of __kernel_{uid,gid}_t is 16 or 32-bits depending on architectures. PPC uses 32-bits for them as in include/uapi/asm-generic/posix_types.h, so the previous conversion is probably fine, but let's stick to the arch-independent conversion just in case. The safe replacements across all architectures are: uid_t --> __kernel_uid32_t gid_t --> __kernel_gid32_t as defined in include/linux/types.h. A similar issue was reported for the android binder. [1] [1]: https://lore.kernel.org/all/20220601010017.2639048-1-cmllamas@google.com/ Fixes: c01013a2f8dd ("powerpc: add asm/stat.h to UAPI compile-test coverage") Signed-off-by: Masahiro Yamada Signed-off-by: Arnd Bergmann --- arch/powerpc/include/uapi/asm/stat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/uapi/asm/stat.h b/arch/powerpc/include/uapi/asm/stat.h index a28c9a1201fa..d50901664239 100644 --- a/arch/powerpc/include/uapi/asm/stat.h +++ b/arch/powerpc/include/uapi/asm/stat.h @@ -37,8 +37,8 @@ struct stat { __kernel_mode_t st_mode; unsigned short st_nlink; #endif - __kernel_uid_t st_uid; - __kernel_gid_t st_gid; + __kernel_uid32_t st_uid; + __kernel_gid32_t st_gid; unsigned long st_rdev; long st_size; unsigned long st_blksize; -- cgit v1.2.3 From 0e3c3b901c00364198d31482fa2552ccf2d5c899 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Jun 2022 18:42:59 -0400 Subject: No need of likely/unlikely on calls of check_copy_size() it's inline and unlikely() inside of it (including the implicit one in WARN_ON_ONCE()) suffice to convince the compiler that getting false from check_copy_size() is unlikely. Spotted-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- arch/powerpc/include/asm/uaccess.h | 2 +- arch/s390/include/asm/uaccess.h | 4 ++-- include/linux/uaccess.h | 4 ++-- include/linux/uio.h | 15 ++++++--------- 4 files changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 9b82b38ff867..105f200b1e31 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -348,7 +348,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned long size) static inline unsigned long __must_check copy_mc_to_user(void __user *to, const void *from, unsigned long n) { - if (likely(check_copy_size(from, n, true))) { + if (check_copy_size(from, n, true)) { if (access_ok(to, n)) { allow_write_to_user(to, n); n = copy_mc_generic((void *)to, from, n); diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index f4511e21d646..c2c9995466e0 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -39,7 +39,7 @@ _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned static __always_inline unsigned long __must_check copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key) { - if (likely(check_copy_size(to, n, false))) + if (check_copy_size(to, n, false)) n = _copy_from_user_key(to, from, n, key); return n; } @@ -50,7 +50,7 @@ _copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned l static __always_inline unsigned long __must_check copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key) { - if (likely(check_copy_size(from, n, true))) + if (check_copy_size(from, n, true)) n = _copy_to_user_key(to, from, n, key); return n; } diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5a328cf02b75..47e5d374c7eb 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -148,7 +148,7 @@ _copy_to_user(void __user *, const void *, unsigned long); static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (likely(check_copy_size(to, n, false))) + if (check_copy_size(to, n, false)) n = _copy_from_user(to, from, n); return n; } @@ -156,7 +156,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (likely(check_copy_size(from, n, true))) + if (check_copy_size(from, n, true)) n = _copy_to_user(to, from, n); return n; } diff --git a/include/linux/uio.h b/include/linux/uio.h index 739285fe5a2f..76d305f3d4c2 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -156,19 +156,17 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { - if (unlikely(!check_copy_size(addr, bytes, true))) - return 0; - else + if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); + return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { - if (unlikely(!check_copy_size(addr, bytes, false))) - return 0; - else + if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); + return 0; } static __always_inline __must_check @@ -184,10 +182,9 @@ bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { - if (unlikely(!check_copy_size(addr, bytes, false))) - return 0; - else + if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); + return 0; } static __always_inline __must_check -- cgit v1.2.3 From 113fe88eed53af08800f54a03e463636105831e0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 11 Jun 2022 18:55:15 +0200 Subject: powerpc: Don't include asm/setup.h in asm/machdep.h asm/machdep.h doesn't need asm/setup.h Remove it. Add it directly in files that needs it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3b1dfb19a2c3265fb4abc2bfc7b6eae9261a998b.1654966508.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/machdep.h | 2 -- arch/powerpc/kernel/pci-common.c | 1 + arch/powerpc/kernel/prom.c | 2 +- arch/powerpc/kernel/prom_init.c | 2 +- arch/powerpc/kexec/core.c | 1 + arch/powerpc/kvm/powerpc.c | 1 + arch/powerpc/mm/mem.c | 1 + arch/powerpc/platforms/pseries/kexec.c | 2 +- arch/powerpc/platforms/pseries/lpar.c | 2 +- arch/powerpc/platforms/pseries/setup.c | 1 + arch/powerpc/sysdev/fsl_pci.c | 1 + drivers/scsi/mesh.c | 2 +- 12 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 358d171ae8e0..613755afa8a9 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -8,8 +8,6 @@ #include #include -#include - struct pt_regs; struct pci_bus; struct device_node; diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 068410cd54a3..c87999289752 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "../../../drivers/pci/pci.h" diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index feae8509b59c..1066b072db35 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 04694ec423f6..1939683e35ed 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 7ab4980fe13a..9773dd0640f3 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -19,6 +19,7 @@ #include #include #include +#include void machine_kexec_mask_interrupts(void) { unsigned int i; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 191992fcb2c2..fb1490761c87 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -33,6 +33,7 @@ #include #endif #include +#include #include "timing.h" #include "irq.h" diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 52b77684acda..2cf6748755e1 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -25,6 +25,7 @@ #include #include #include +#include #include diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index ab6cdbebb35e..096d09ed89f6 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 937f9c010b22..e6c117fb6491 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index afb074269b42..e86a749173e6 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -72,6 +72,7 @@ #include #include #include +#include #include "pseries.h" diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 1011cfea2e32..e8d072e98b66 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index 322d3ad38159..a74f0c2c8ba7 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 7dc3ba0a071892ea212f90f63738fd9f81b1f638 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 11 Jun 2022 18:55:16 +0200 Subject: powerpc: Move prom_init() out of asm-prototypes.h This is the end of the work started with commit 76222808fc25 ("powerpc: Move C prototypes out of asm-prototypes.h") Now that asm/machdep.h doesn't include asm/setup.h anymore, there are no conflicts anymore with the function prom_init() defined in drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowrom.o So we can move it to asm/setup.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e111e4f0addb0fa810d5f6a71d3b8e62c0b53492.1654966508.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-prototypes.h | 6 ------ arch/powerpc/include/asm/setup.h | 5 +++++ 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index d995c65d18ab..3b2bbc273055 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -34,12 +34,6 @@ int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode, uint64_t msr); -/* prom_init (OpenFirmware) */ -unsigned long __init prom_init(unsigned long r3, unsigned long r4, - unsigned long pp, - unsigned long r6, unsigned long r7, - unsigned long kbase); - /* misc runtime */ extern u64 __bswapdi2(u64); extern s64 __lshrdi3(s64, int); diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 8fa37ef5da4d..14a0b8af738e 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -85,6 +85,11 @@ void __init machine_init(u64 dt_ptr); void __init early_setup(unsigned long dt_ptr); void early_setup_secondary(void); +/* prom_init (OpenFirmware) */ +unsigned long __init prom_init(unsigned long r3, unsigned long r4, + unsigned long pp, unsigned long r6, + unsigned long r7, unsigned long kbase); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ -- cgit v1.2.3 From 6d056b7254f9954522b7bb9947c8779a013d189f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 7 May 2022 13:01:44 +0300 Subject: powerpc/52xx: Remove dead code, i.e. mpc52xx_get_xtal_freq() It seems mpc52xx_get_xtal_freq() is not used anywhere. Remove dead code. Signed-off-by: Andy Shevchenko Reviewed-by: Wolfram Sang Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220507100147.5802-1-andriy.shevchenko@linux.intel.com --- arch/powerpc/include/asm/mpc52xx.h | 1 - arch/powerpc/platforms/52xx/mpc52xx_common.c | 37 ---------------------------- 2 files changed, 38 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h index ce1e0aabaa64..ddd80aae1e32 100644 --- a/arch/powerpc/include/asm/mpc52xx.h +++ b/arch/powerpc/include/asm/mpc52xx.h @@ -274,7 +274,6 @@ extern void mpc52xx_declare_of_platform_devices(void); extern int mpc5200_psc_ac97_gpio_reset(int psc_number); extern void mpc52xx_map_common_devices(void); extern int mpc52xx_set_psc_clkdiv(int psc_id, int clkdiv); -extern unsigned int mpc52xx_get_xtal_freq(struct device_node *node); extern void __noreturn mpc52xx_restart(char *cmd); /* mpc52xx_gpt.c */ diff --git a/arch/powerpc/platforms/52xx/mpc52xx_common.c b/arch/powerpc/platforms/52xx/mpc52xx_common.c index 4348506d667d..409c0ec06265 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_common.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_common.c @@ -203,43 +203,6 @@ int mpc52xx_set_psc_clkdiv(int psc_id, int clkdiv) } EXPORT_SYMBOL(mpc52xx_set_psc_clkdiv); -/** - * mpc52xx_get_xtal_freq - Get SYS_XTAL_IN frequency for a device - * - * @node: device node - * - * Returns the frequency of the external oscillator clock connected - * to the SYS_XTAL_IN pin, or 0 if it cannot be determined. - */ -unsigned int mpc52xx_get_xtal_freq(struct device_node *node) -{ - u32 val; - unsigned int freq; - - if (!mpc52xx_cdm) - return 0; - - freq = mpc5xxx_get_bus_frequency(node); - if (!freq) - return 0; - - if (in_8(&mpc52xx_cdm->ipb_clk_sel) & 0x1) - freq *= 2; - - val = in_be32(&mpc52xx_cdm->rstcfg); - if (val & (1 << 5)) - freq *= 8; - else - freq *= 4; - if (val & (1 << 6)) - freq /= 12; - else - freq /= 16; - - return freq; -} -EXPORT_SYMBOL(mpc52xx_get_xtal_freq); - /** * mpc52xx_restart: ppc_md->restart hook for mpc5200 using the watchdog timer */ -- cgit v1.2.3 From de06fba62af64144aca6f8a8bedbc848d2e5b440 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 7 May 2022 13:01:45 +0300 Subject: powerpc/mpc5xxx: Switch mpc5xxx_get_bus_frequency() to use fwnode Switch mpc5xxx_get_bus_frequency() to use fwnode in order to help cleaning up other parts of the kernel from OF specific code. No functional change intended. Signed-off-by: Andy Shevchenko Acked-by: Chris Packham # for i2c-mpc Acked-by: Wolfram Sang # for the I2C part Acked-by: Mark Brown Acked-by: Marc Kleine-Budde # for mscan/mpc5xxx_can Acked-by: Damien Le Moal Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220507100147.5802-2-andriy.shevchenko@linux.intel.com --- arch/powerpc/include/asm/mpc5xxx.h | 9 +++++- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 2 +- arch/powerpc/sysdev/mpc5xxx_clocks.c | 41 +++++++++++++----------- drivers/ata/pata_mpc52xx.c | 2 +- drivers/i2c/busses/i2c-mpc.c | 7 ++-- drivers/net/can/mscan/mpc5xxx_can.c | 2 +- drivers/net/ethernet/freescale/fec_mpc52xx.c | 2 +- drivers/net/ethernet/freescale/fec_mpc52xx_phy.c | 3 +- drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 4 +-- drivers/spi/spi-mpc52xx.c | 2 +- drivers/tty/serial/mpc52xx_uart.c | 4 +-- 11 files changed, 44 insertions(+), 34 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/mpc5xxx.h b/arch/powerpc/include/asm/mpc5xxx.h index 2f60f5c5461b..44db26380435 100644 --- a/arch/powerpc/include/asm/mpc5xxx.h +++ b/arch/powerpc/include/asm/mpc5xxx.h @@ -11,7 +11,14 @@ #ifndef __ASM_POWERPC_MPC5xxx_H__ #define __ASM_POWERPC_MPC5xxx_H__ -extern unsigned long mpc5xxx_get_bus_frequency(struct device_node *node); +#include + +unsigned long mpc5xxx_fwnode_get_bus_frequency(struct fwnode_handle *fwnode); + +static inline unsigned long mpc5xxx_get_bus_frequency(struct device *dev) +{ + return mpc5xxx_fwnode_get_bus_frequency(dev_fwnode(dev)); +} #endif /* __ASM_POWERPC_MPC5xxx_H__ */ diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 968f5b727273..76ece6001cdf 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -722,7 +722,7 @@ static int mpc52xx_gpt_probe(struct platform_device *ofdev) raw_spin_lock_init(&gpt->lock); gpt->dev = &ofdev->dev; - gpt->ipb_freq = mpc5xxx_get_bus_frequency(ofdev->dev.of_node); + gpt->ipb_freq = mpc5xxx_get_bus_frequency(&ofdev->dev); gpt->regs = of_iomap(ofdev->dev.of_node, 0); if (!gpt->regs) return -ENOMEM; diff --git a/arch/powerpc/sysdev/mpc5xxx_clocks.c b/arch/powerpc/sysdev/mpc5xxx_clocks.c index 834a6d7fbd88..c5bf7e1b3780 100644 --- a/arch/powerpc/sysdev/mpc5xxx_clocks.c +++ b/arch/powerpc/sysdev/mpc5xxx_clocks.c @@ -1,31 +1,34 @@ // SPDX-License-Identifier: GPL-2.0 -/** - * mpc5xxx_get_bus_frequency - Find the bus frequency for a device - * @node: device node - * - * Returns bus frequency (IPS on MPC512x, IPB on MPC52xx), - * or 0 if the bus frequency cannot be found. - */ #include -#include #include +#include + #include -unsigned long mpc5xxx_get_bus_frequency(struct device_node *node) +/** + * mpc5xxx_fwnode_get_bus_frequency - Find the bus frequency for a firmware node + * @fwnode: firmware node + * + * Returns bus frequency (IPS on MPC512x, IPB on MPC52xx), + * or 0 if the bus frequency cannot be found. + */ +unsigned long mpc5xxx_fwnode_get_bus_frequency(struct fwnode_handle *fwnode) { - const unsigned int *p_bus_freq = NULL; + struct fwnode_handle *parent; + u32 bus_freq; + int ret; - of_node_get(node); - while (node) { - p_bus_freq = of_get_property(node, "bus-frequency", NULL); - if (p_bus_freq) - break; + ret = fwnode_property_read_u32(fwnode, "bus-frequency", &bus_freq); + if (!ret) + return bus_freq; - node = of_get_next_parent(node); + fwnode_for_each_parent_node(fwnode, parent) { + ret = fwnode_property_read_u32(parent, "bus-frequency", &bus_freq); + if (!ret) + return bus_freq; } - of_node_put(node); - return p_bus_freq ? *p_bus_freq : 0; + return 0; } -EXPORT_SYMBOL(mpc5xxx_get_bus_frequency); +EXPORT_SYMBOL(mpc5xxx_fwnode_get_bus_frequency); diff --git a/drivers/ata/pata_mpc52xx.c b/drivers/ata/pata_mpc52xx.c index 03b6ae37a578..6559b606736d 100644 --- a/drivers/ata/pata_mpc52xx.c +++ b/drivers/ata/pata_mpc52xx.c @@ -683,7 +683,7 @@ static int mpc52xx_ata_probe(struct platform_device *op) struct bcom_task *dmatsk; /* Get ipb frequency */ - ipb_freq = mpc5xxx_get_bus_frequency(op->dev.of_node); + ipb_freq = mpc5xxx_get_bus_frequency(&op->dev); if (!ipb_freq) { dev_err(&op->dev, "could not determine IPB bus frequency\n"); return -ENODEV; diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index 6c698c10d3cd..81ac92bb4f6f 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -239,6 +239,7 @@ static const struct mpc_i2c_divider mpc_i2c_dividers_52xx[] = { static int mpc_i2c_get_fdr_52xx(struct device_node *node, u32 clock, u32 *real_clk) { + struct fwnode_handle *fwnode = of_fwnode_handle(node); const struct mpc_i2c_divider *div = NULL; unsigned int pvr = mfspr(SPRN_PVR); u32 divider; @@ -246,12 +247,12 @@ static int mpc_i2c_get_fdr_52xx(struct device_node *node, u32 clock, if (clock == MPC_I2C_CLOCK_LEGACY) { /* see below - default fdr = 0x3f -> div = 2048 */ - *real_clk = mpc5xxx_get_bus_frequency(node) / 2048; + *real_clk = mpc5xxx_fwnode_get_bus_frequency(fwnode) / 2048; return -EINVAL; } /* Determine divider value */ - divider = mpc5xxx_get_bus_frequency(node) / clock; + divider = mpc5xxx_fwnode_get_bus_frequency(fwnode) / clock; /* * We want to choose an FDR/DFSR that generates an I2C bus speed that @@ -266,7 +267,7 @@ static int mpc_i2c_get_fdr_52xx(struct device_node *node, u32 clock, break; } - *real_clk = mpc5xxx_get_bus_frequency(node) / div->divider; + *real_clk = mpc5xxx_fwnode_get_bus_frequency(fwnode) / div->divider; return (int)div->fdr; } diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c index 65ba6697bd7d..c469b2f3e57d 100644 --- a/drivers/net/can/mscan/mpc5xxx_can.c +++ b/drivers/net/can/mscan/mpc5xxx_can.c @@ -63,7 +63,7 @@ static u32 mpc52xx_can_get_clock(struct platform_device *ofdev, else *mscan_clksrc = MSCAN_CLKSRC_XTAL; - freq = mpc5xxx_get_bus_frequency(ofdev->dev.of_node); + freq = mpc5xxx_get_bus_frequency(&ofdev->dev); if (!freq) return 0; diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index 5ddb769bdfb4..a7f4c3c29f3e 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -924,7 +924,7 @@ static int mpc52xx_fec_probe(struct platform_device *op) /* Start with safe defaults for link connection */ priv->speed = 100; priv->duplex = DUPLEX_HALF; - priv->mdio_speed = ((mpc5xxx_get_bus_frequency(np) >> 20) / 5) << 1; + priv->mdio_speed = ((mpc5xxx_get_bus_frequency(&op->dev) >> 20) / 5) << 1; /* The current speed preconfigures the speed of the MII link */ prop = of_get_property(np, "current-speed", &prop_size); diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c index f85b5e81dfc1..95f778cce98c 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c @@ -100,8 +100,7 @@ static int mpc52xx_fec_mdio_probe(struct platform_device *of) dev_set_drvdata(dev, bus); /* set MII speed */ - out_be32(&priv->regs->mii_speed, - ((mpc5xxx_get_bus_frequency(of->dev.of_node) >> 20) / 5) << 1); + out_be32(&priv->regs->mii_speed, ((mpc5xxx_get_bus_frequency(dev) >> 20) / 5) << 1); err = of_mdiobus_register(bus, np); if (err) diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c index 152f4d83765a..d37d7a19a759 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c @@ -102,7 +102,7 @@ static int fs_enet_mdio_probe(struct platform_device *ofdev) struct resource res; struct mii_bus *new_bus; struct fec_info *fec; - int (*get_bus_freq)(struct device_node *); + int (*get_bus_freq)(struct device *); int ret = -ENOMEM, clock, speed; match = of_match_device(fs_enet_mdio_fec_match, &ofdev->dev); @@ -136,7 +136,7 @@ static int fs_enet_mdio_probe(struct platform_device *ofdev) } if (get_bus_freq) { - clock = get_bus_freq(ofdev->dev.of_node); + clock = get_bus_freq(&ofdev->dev); if (!clock) { /* Use maximum divider if clock is unknown */ dev_warn(&ofdev->dev, "could not determine IPS clock\n"); diff --git a/drivers/spi/spi-mpc52xx.c b/drivers/spi/spi-mpc52xx.c index 3ebdce804b90..bc5e36fd4288 100644 --- a/drivers/spi/spi-mpc52xx.c +++ b/drivers/spi/spi-mpc52xx.c @@ -437,7 +437,7 @@ static int mpc52xx_spi_probe(struct platform_device *op) ms->irq0 = irq_of_parse_and_map(op->dev.of_node, 0); ms->irq1 = irq_of_parse_and_map(op->dev.of_node, 1); ms->state = mpc52xx_spi_fsmstate_idle; - ms->ipb_freq = mpc5xxx_get_bus_frequency(op->dev.of_node); + ms->ipb_freq = mpc5xxx_get_bus_frequency(&op->dev); ms->gpio_cs_count = of_gpio_count(op->dev.of_node); if (ms->gpio_cs_count > 0) { master->num_chipselect = ms->gpio_cs_count; diff --git a/drivers/tty/serial/mpc52xx_uart.c b/drivers/tty/serial/mpc52xx_uart.c index e50f069b5ebb..3f1986c89694 100644 --- a/drivers/tty/serial/mpc52xx_uart.c +++ b/drivers/tty/serial/mpc52xx_uart.c @@ -1630,7 +1630,7 @@ mpc52xx_console_setup(struct console *co, char *options) return ret; } - uartclk = mpc5xxx_get_bus_frequency(np); + uartclk = mpc5xxx_fwnode_get_bus_frequency(of_fwnode_handle(np)); if (uartclk == 0) { pr_debug("Could not find uart clock frequency!\n"); return -EINVAL; @@ -1747,7 +1747,7 @@ static int mpc52xx_uart_of_probe(struct platform_device *op) /* set the uart clock to the input clock of the psc, the different * prescalers are taken into account in the set_baudrate() methods * of the respective chip */ - uartclk = mpc5xxx_get_bus_frequency(op->dev.of_node); + uartclk = mpc5xxx_get_bus_frequency(&op->dev); if (uartclk == 0) { dev_dbg(&op->dev, "Could not find uart clock frequency!\n"); return -EINVAL; -- cgit v1.2.3 From 2d386769753a71e57a1a38c7fb79013d3ac451e9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 17 Jun 2022 18:02:43 +1000 Subject: powerpc: Update asm-prototypes.h comment This header was recently cleaned up in commit 76222808fc25 ("powerpc: Move C prototypes out of asm-prototypes.h"), update the comment to reflect it's proper purpose. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220617080243.2177583-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/asm-prototypes.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 3b2bbc273055..81631e64dbeb 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -2,8 +2,9 @@ #ifndef _ASM_POWERPC_ASM_PROTOTYPES_H #define _ASM_POWERPC_ASM_PROTOTYPES_H /* - * This file is for prototypes of C functions that are only called - * from asm, and any associated variables. + * This file is for C prototypes of asm symbols that are EXPORTed. + * It allows the modversions logic to see their prototype and + * generate proper CRCs for them. * * Copyright 2016, Daniel Axtens, IBM Corporation. */ -- cgit v1.2.3 From d7f396461518c766b2436d64b6d3ba6a4c418dcf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 23 Jun 2022 12:08:36 +0200 Subject: powerpc/powermac: Remove empty function note_scsi_host() note_scsi_host() has been an empty function since commit 6ee0d9f744d4 ("[POWERPC] Remove unused old code from powermac setup code"). Remove it. Reported-by: kernel test robot Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/26f8b72a4276c0bd8ed63860c7316f6361c351b4.1655978907.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/setup.h | 1 - arch/powerpc/platforms/powermac/setup.c | 7 ------- drivers/scsi/mesh.c | 5 ----- 3 files changed, 13 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 14a0b8af738e..d8c28902cf59 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -12,7 +12,6 @@ extern unsigned long long memory_limit; extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask); struct device_node; -extern void note_scsi_host(struct device_node *, void *); /* Used in very early kernel initialization. */ extern unsigned long reloc_offset(void); diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index f71735ec449f..04daa7f0a03c 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -320,13 +320,6 @@ static void __init pmac_setup_arch(void) #endif /* CONFIG_ADB */ } -#ifdef CONFIG_SCSI -void note_scsi_host(struct device_node *node, void *host) -{ -} -EXPORT_SYMBOL(note_scsi_host); -#endif - static int initializing = 1; static int pmac_late_init(void) diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index a74f0c2c8ba7..84b541a57b7b 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -1882,11 +1882,6 @@ static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) goto out_release; } - /* Old junk for root discovery, that will die ultimately */ -#if !defined(MODULE) - note_scsi_host(mesh, mesh_host); -#endif - mesh_host->base = macio_resource_start(mdev, 0); mesh_host->irq = macio_irq(mdev, 0); ms = (struct mesh_state *) mesh_host->hostdata; -- cgit v1.2.3 From ee65728e103bb7dd99d8604bf6c7aa89c7d7e446 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Jun 2022 09:00:26 +0300 Subject: docs: rename Documentation/vm to Documentation/mm so it will be consistent with code mm directory and with Documentation/admin-guide/mm and won't be confused with virtual machines. Signed-off-by: Mike Rapoport Suggested-by: Matthew Wilcox Tested-by: Ira Weiny Acked-by: Jonathan Corbet Acked-by: Wu XiangCheng --- Documentation/ABI/testing/sysfs-kernel-mm-ksm | 2 +- Documentation/ABI/testing/sysfs-kernel-slab | 4 +- Documentation/admin-guide/kernel-parameters.txt | 10 +- Documentation/admin-guide/mm/concepts.rst | 2 +- Documentation/admin-guide/mm/damon/index.rst | 2 +- Documentation/admin-guide/mm/damon/reclaim.rst | 2 +- Documentation/admin-guide/mm/damon/usage.rst | 8 +- Documentation/admin-guide/sysctl/vm.rst | 2 +- Documentation/core-api/index.rst | 2 +- Documentation/filesystems/proc.rst | 2 +- Documentation/index.rst | 2 +- Documentation/mm/active_mm.rst | 91 ++++ Documentation/mm/arch_pgtable_helpers.rst | 260 +++++++++ Documentation/mm/balance.rst | 102 ++++ Documentation/mm/bootmem.rst | 5 + Documentation/mm/damon/api.rst | 20 + Documentation/mm/damon/design.rst | 176 ++++++ Documentation/mm/damon/faq.rst | 50 ++ Documentation/mm/damon/index.rst | 29 + Documentation/mm/free_page_reporting.rst | 40 ++ Documentation/mm/frontswap.rst | 266 +++++++++ Documentation/mm/highmem.rst | 167 ++++++ Documentation/mm/hmm.rst | 452 ++++++++++++++++ Documentation/mm/hugetlbfs_reserv.rst | 596 +++++++++++++++++++++ Documentation/mm/hwpoison.rst | 184 +++++++ Documentation/mm/index.rst | 68 +++ Documentation/mm/ksm.rst | 87 +++ Documentation/mm/memory-model.rst | 177 ++++++ Documentation/mm/mmu_notifier.rst | 99 ++++ Documentation/mm/numa.rst | 150 ++++++ Documentation/mm/oom.rst | 5 + Documentation/mm/overcommit-accounting.rst | 88 +++ Documentation/mm/page_allocation.rst | 5 + Documentation/mm/page_cache.rst | 5 + Documentation/mm/page_frags.rst | 45 ++ Documentation/mm/page_migration.rst | 288 ++++++++++ Documentation/mm/page_owner.rst | 196 +++++++ Documentation/mm/page_reclaim.rst | 5 + Documentation/mm/page_table_check.rst | 56 ++ Documentation/mm/page_tables.rst | 5 + Documentation/mm/physical_memory.rst | 5 + Documentation/mm/process_addrs.rst | 5 + Documentation/mm/remap_file_pages.rst | 33 ++ Documentation/mm/shmfs.rst | 5 + Documentation/mm/slab.rst | 5 + Documentation/mm/slub.rst | 452 ++++++++++++++++ Documentation/mm/split_page_table_lock.rst | 100 ++++ Documentation/mm/swap.rst | 5 + Documentation/mm/transhuge.rst | 187 +++++++ Documentation/mm/unevictable-lru.rst | 554 +++++++++++++++++++ Documentation/mm/vmalloc.rst | 5 + Documentation/mm/vmalloced-kernel-stacks.rst | 153 ++++++ Documentation/mm/vmemmap_dedup.rst | 223 ++++++++ Documentation/mm/z3fold.rst | 30 ++ Documentation/mm/zsmalloc.rst | 82 +++ .../zh_CN/admin-guide/mm/damon/index.rst | 2 +- .../zh_CN/admin-guide/mm/damon/reclaim.rst | 2 +- .../zh_CN/admin-guide/mm/damon/usage.rst | 8 +- .../translations/zh_CN/core-api/index.rst | 2 +- Documentation/translations/zh_CN/index.rst | 2 +- Documentation/translations/zh_CN/mm/active_mm.rst | 85 +++ Documentation/translations/zh_CN/mm/balance.rst | 81 +++ Documentation/translations/zh_CN/mm/damon/api.rst | 32 ++ .../translations/zh_CN/mm/damon/design.rst | 140 +++++ Documentation/translations/zh_CN/mm/damon/faq.rst | 48 ++ .../translations/zh_CN/mm/damon/index.rst | 32 ++ .../translations/zh_CN/mm/free_page_reporting.rst | 38 ++ Documentation/translations/zh_CN/mm/frontswap.rst | 196 +++++++ Documentation/translations/zh_CN/mm/highmem.rst | 128 +++++ Documentation/translations/zh_CN/mm/hmm.rst | 361 +++++++++++++ .../translations/zh_CN/mm/hugetlbfs_reserv.rst | 436 +++++++++++++++ Documentation/translations/zh_CN/mm/hwpoison.rst | 166 ++++++ Documentation/translations/zh_CN/mm/index.rst | 54 ++ Documentation/translations/zh_CN/mm/ksm.rst | 70 +++ .../translations/zh_CN/mm/memory-model.rst | 135 +++++ .../translations/zh_CN/mm/mmu_notifier.rst | 97 ++++ Documentation/translations/zh_CN/mm/numa.rst | 101 ++++ .../zh_CN/mm/overcommit-accounting.rst | 86 +++ Documentation/translations/zh_CN/mm/page_frags.rst | 38 ++ Documentation/translations/zh_CN/mm/page_owner.rst | 116 ++++ .../translations/zh_CN/mm/page_table_check.rst | 56 ++ .../translations/zh_CN/mm/remap_file_pages.rst | 32 ++ .../zh_CN/mm/split_page_table_lock.rst | 96 ++++ Documentation/translations/zh_CN/mm/z3fold.rst | 31 ++ Documentation/translations/zh_CN/mm/zsmalloc.rst | 78 +++ Documentation/translations/zh_CN/vm/active_mm.rst | 85 --- Documentation/translations/zh_CN/vm/balance.rst | 81 --- Documentation/translations/zh_CN/vm/damon/api.rst | 32 -- .../translations/zh_CN/vm/damon/design.rst | 140 ----- Documentation/translations/zh_CN/vm/damon/faq.rst | 48 -- .../translations/zh_CN/vm/damon/index.rst | 33 -- .../translations/zh_CN/vm/free_page_reporting.rst | 38 -- Documentation/translations/zh_CN/vm/frontswap.rst | 196 ------- Documentation/translations/zh_CN/vm/highmem.rst | 128 ----- Documentation/translations/zh_CN/vm/hmm.rst | 361 ------------- .../translations/zh_CN/vm/hugetlbfs_reserv.rst | 436 --------------- Documentation/translations/zh_CN/vm/hwpoison.rst | 166 ------ Documentation/translations/zh_CN/vm/index.rst | 54 -- Documentation/translations/zh_CN/vm/ksm.rst | 70 --- .../translations/zh_CN/vm/memory-model.rst | 135 ----- .../translations/zh_CN/vm/mmu_notifier.rst | 97 ---- Documentation/translations/zh_CN/vm/numa.rst | 101 ---- .../zh_CN/vm/overcommit-accounting.rst | 86 --- Documentation/translations/zh_CN/vm/page_frags.rst | 38 -- Documentation/translations/zh_CN/vm/page_owner.rst | 116 ---- .../translations/zh_CN/vm/page_table_check.rst | 56 -- .../translations/zh_CN/vm/remap_file_pages.rst | 32 -- .../zh_CN/vm/split_page_table_lock.rst | 96 ---- Documentation/translations/zh_CN/vm/z3fold.rst | 31 -- Documentation/translations/zh_CN/vm/zsmalloc.rst | 78 --- Documentation/translations/zh_TW/index.rst | 2 +- Documentation/vm/.gitignore | 3 - Documentation/vm/active_mm.rst | 91 ---- Documentation/vm/arch_pgtable_helpers.rst | 260 --------- Documentation/vm/balance.rst | 102 ---- Documentation/vm/bootmem.rst | 5 - Documentation/vm/damon/api.rst | 20 - Documentation/vm/damon/design.rst | 176 ------ Documentation/vm/damon/faq.rst | 50 -- Documentation/vm/damon/index.rst | 29 - Documentation/vm/free_page_reporting.rst | 40 -- Documentation/vm/frontswap.rst | 266 --------- Documentation/vm/highmem.rst | 167 ------ Documentation/vm/hmm.rst | 452 ---------------- Documentation/vm/hugetlbfs_reserv.rst | 596 --------------------- Documentation/vm/hwpoison.rst | 184 ------- Documentation/vm/index.rst | 68 --- Documentation/vm/ksm.rst | 87 --- Documentation/vm/memory-model.rst | 177 ------ Documentation/vm/mmu_notifier.rst | 99 ---- Documentation/vm/numa.rst | 150 ------ Documentation/vm/oom.rst | 5 - Documentation/vm/overcommit-accounting.rst | 88 --- Documentation/vm/page_allocation.rst | 5 - Documentation/vm/page_cache.rst | 5 - Documentation/vm/page_frags.rst | 45 -- Documentation/vm/page_migration.rst | 288 ---------- Documentation/vm/page_owner.rst | 196 ------- Documentation/vm/page_reclaim.rst | 5 - Documentation/vm/page_table_check.rst | 56 -- Documentation/vm/page_tables.rst | 5 - Documentation/vm/physical_memory.rst | 5 - Documentation/vm/process_addrs.rst | 5 - Documentation/vm/remap_file_pages.rst | 33 -- Documentation/vm/shmfs.rst | 5 - Documentation/vm/slab.rst | 5 - Documentation/vm/slub.rst | 452 ---------------- Documentation/vm/split_page_table_lock.rst | 100 ---- Documentation/vm/swap.rst | 5 - Documentation/vm/transhuge.rst | 187 ------- Documentation/vm/unevictable-lru.rst | 554 ------------------- Documentation/vm/vmalloc.rst | 5 - Documentation/vm/vmalloced-kernel-stacks.rst | 153 ------ Documentation/vm/vmemmap_dedup.rst | 223 -------- Documentation/vm/z3fold.rst | 30 -- Documentation/vm/zsmalloc.rst | 82 --- MAINTAINERS | 12 +- arch/loongarch/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- include/linux/hmm.h | 4 +- include/linux/memremap.h | 2 +- include/linux/mmu_notifier.h | 2 +- include/linux/sched/mm.h | 4 +- include/linux/swap.h | 2 +- mm/Kconfig | 2 +- mm/debug_vm_pgtable.c | 2 +- mm/frontswap.c | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 6 +- mm/hugetlb_vmemmap.c | 2 +- mm/ksm.c | 4 +- mm/mmap.c | 2 +- mm/rmap.c | 8 +- mm/sparse-vmemmap.c | 2 +- mm/util.c | 2 +- tools/vm/page_owner_sort.c | 2 +- 176 files changed, 8355 insertions(+), 8359 deletions(-) create mode 100644 Documentation/mm/active_mm.rst create mode 100644 Documentation/mm/arch_pgtable_helpers.rst create mode 100644 Documentation/mm/balance.rst create mode 100644 Documentation/mm/bootmem.rst create mode 100644 Documentation/mm/damon/api.rst create mode 100644 Documentation/mm/damon/design.rst create mode 100644 Documentation/mm/damon/faq.rst create mode 100644 Documentation/mm/damon/index.rst create mode 100644 Documentation/mm/free_page_reporting.rst create mode 100644 Documentation/mm/frontswap.rst create mode 100644 Documentation/mm/highmem.rst create mode 100644 Documentation/mm/hmm.rst create mode 100644 Documentation/mm/hugetlbfs_reserv.rst create mode 100644 Documentation/mm/hwpoison.rst create mode 100644 Documentation/mm/index.rst create mode 100644 Documentation/mm/ksm.rst create mode 100644 Documentation/mm/memory-model.rst create mode 100644 Documentation/mm/mmu_notifier.rst create mode 100644 Documentation/mm/numa.rst create mode 100644 Documentation/mm/oom.rst create mode 100644 Documentation/mm/overcommit-accounting.rst create mode 100644 Documentation/mm/page_allocation.rst create mode 100644 Documentation/mm/page_cache.rst create mode 100644 Documentation/mm/page_frags.rst create mode 100644 Documentation/mm/page_migration.rst create mode 100644 Documentation/mm/page_owner.rst create mode 100644 Documentation/mm/page_reclaim.rst create mode 100644 Documentation/mm/page_table_check.rst create mode 100644 Documentation/mm/page_tables.rst create mode 100644 Documentation/mm/physical_memory.rst create mode 100644 Documentation/mm/process_addrs.rst create mode 100644 Documentation/mm/remap_file_pages.rst create mode 100644 Documentation/mm/shmfs.rst create mode 100644 Documentation/mm/slab.rst create mode 100644 Documentation/mm/slub.rst create mode 100644 Documentation/mm/split_page_table_lock.rst create mode 100644 Documentation/mm/swap.rst create mode 100644 Documentation/mm/transhuge.rst create mode 100644 Documentation/mm/unevictable-lru.rst create mode 100644 Documentation/mm/vmalloc.rst create mode 100644 Documentation/mm/vmalloced-kernel-stacks.rst create mode 100644 Documentation/mm/vmemmap_dedup.rst create mode 100644 Documentation/mm/z3fold.rst create mode 100644 Documentation/mm/zsmalloc.rst create mode 100644 Documentation/translations/zh_CN/mm/active_mm.rst create mode 100644 Documentation/translations/zh_CN/mm/balance.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/api.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/design.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/faq.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/index.rst create mode 100644 Documentation/translations/zh_CN/mm/free_page_reporting.rst create mode 100644 Documentation/translations/zh_CN/mm/frontswap.rst create mode 100644 Documentation/translations/zh_CN/mm/highmem.rst create mode 100644 Documentation/translations/zh_CN/mm/hmm.rst create mode 100644 Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst create mode 100644 Documentation/translations/zh_CN/mm/hwpoison.rst create mode 100644 Documentation/translations/zh_CN/mm/index.rst create mode 100644 Documentation/translations/zh_CN/mm/ksm.rst create mode 100644 Documentation/translations/zh_CN/mm/memory-model.rst create mode 100644 Documentation/translations/zh_CN/mm/mmu_notifier.rst create mode 100644 Documentation/translations/zh_CN/mm/numa.rst create mode 100644 Documentation/translations/zh_CN/mm/overcommit-accounting.rst create mode 100644 Documentation/translations/zh_CN/mm/page_frags.rst create mode 100644 Documentation/translations/zh_CN/mm/page_owner.rst create mode 100644 Documentation/translations/zh_CN/mm/page_table_check.rst create mode 100644 Documentation/translations/zh_CN/mm/remap_file_pages.rst create mode 100644 Documentation/translations/zh_CN/mm/split_page_table_lock.rst create mode 100644 Documentation/translations/zh_CN/mm/z3fold.rst create mode 100644 Documentation/translations/zh_CN/mm/zsmalloc.rst delete mode 100644 Documentation/translations/zh_CN/vm/active_mm.rst delete mode 100644 Documentation/translations/zh_CN/vm/balance.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/api.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/design.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/faq.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/index.rst delete mode 100644 Documentation/translations/zh_CN/vm/free_page_reporting.rst delete mode 100644 Documentation/translations/zh_CN/vm/frontswap.rst delete mode 100644 Documentation/translations/zh_CN/vm/highmem.rst delete mode 100644 Documentation/translations/zh_CN/vm/hmm.rst delete mode 100644 Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/translations/zh_CN/vm/hwpoison.rst delete mode 100644 Documentation/translations/zh_CN/vm/index.rst delete mode 100644 Documentation/translations/zh_CN/vm/ksm.rst delete mode 100644 Documentation/translations/zh_CN/vm/memory-model.rst delete mode 100644 Documentation/translations/zh_CN/vm/mmu_notifier.rst delete mode 100644 Documentation/translations/zh_CN/vm/numa.rst delete mode 100644 Documentation/translations/zh_CN/vm/overcommit-accounting.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_frags.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_owner.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_table_check.rst delete mode 100644 Documentation/translations/zh_CN/vm/remap_file_pages.rst delete mode 100644 Documentation/translations/zh_CN/vm/split_page_table_lock.rst delete mode 100644 Documentation/translations/zh_CN/vm/z3fold.rst delete mode 100644 Documentation/translations/zh_CN/vm/zsmalloc.rst delete mode 100644 Documentation/vm/.gitignore delete mode 100644 Documentation/vm/active_mm.rst delete mode 100644 Documentation/vm/arch_pgtable_helpers.rst delete mode 100644 Documentation/vm/balance.rst delete mode 100644 Documentation/vm/bootmem.rst delete mode 100644 Documentation/vm/damon/api.rst delete mode 100644 Documentation/vm/damon/design.rst delete mode 100644 Documentation/vm/damon/faq.rst delete mode 100644 Documentation/vm/damon/index.rst delete mode 100644 Documentation/vm/free_page_reporting.rst delete mode 100644 Documentation/vm/frontswap.rst delete mode 100644 Documentation/vm/highmem.rst delete mode 100644 Documentation/vm/hmm.rst delete mode 100644 Documentation/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/vm/hwpoison.rst delete mode 100644 Documentation/vm/index.rst delete mode 100644 Documentation/vm/ksm.rst delete mode 100644 Documentation/vm/memory-model.rst delete mode 100644 Documentation/vm/mmu_notifier.rst delete mode 100644 Documentation/vm/numa.rst delete mode 100644 Documentation/vm/oom.rst delete mode 100644 Documentation/vm/overcommit-accounting.rst delete mode 100644 Documentation/vm/page_allocation.rst delete mode 100644 Documentation/vm/page_cache.rst delete mode 100644 Documentation/vm/page_frags.rst delete mode 100644 Documentation/vm/page_migration.rst delete mode 100644 Documentation/vm/page_owner.rst delete mode 100644 Documentation/vm/page_reclaim.rst delete mode 100644 Documentation/vm/page_table_check.rst delete mode 100644 Documentation/vm/page_tables.rst delete mode 100644 Documentation/vm/physical_memory.rst delete mode 100644 Documentation/vm/process_addrs.rst delete mode 100644 Documentation/vm/remap_file_pages.rst delete mode 100644 Documentation/vm/shmfs.rst delete mode 100644 Documentation/vm/slab.rst delete mode 100644 Documentation/vm/slub.rst delete mode 100644 Documentation/vm/split_page_table_lock.rst delete mode 100644 Documentation/vm/swap.rst delete mode 100644 Documentation/vm/transhuge.rst delete mode 100644 Documentation/vm/unevictable-lru.rst delete mode 100644 Documentation/vm/vmalloc.rst delete mode 100644 Documentation/vm/vmalloced-kernel-stacks.rst delete mode 100644 Documentation/vm/vmemmap_dedup.rst delete mode 100644 Documentation/vm/z3fold.rst delete mode 100644 Documentation/vm/zsmalloc.rst (limited to 'arch/powerpc/include') diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm index 1c9bed5595f5..d244674a9480 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm @@ -41,7 +41,7 @@ Description: Kernel Samepage Merging daemon sysfs interface sleep_millisecs: how many milliseconds ksm should sleep between scans. - See Documentation/vm/ksm.rst for more information. + See Documentation/mm/ksm.rst for more information. What: /sys/kernel/mm/ksm/merge_across_nodes Date: January 2013 diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index c440f4946e12..cd5fb8fa3ddf 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -37,7 +37,7 @@ Description: The alloc_calls file is read-only and lists the kernel code locations from which allocations for this cache were performed. The alloc_calls file only contains information if debugging is - enabled for that cache (see Documentation/vm/slub.rst). + enabled for that cache (see Documentation/mm/slub.rst). What: /sys/kernel/slab//alloc_fastpath Date: February 2008 @@ -219,7 +219,7 @@ Contact: Pekka Enberg , Description: The free_calls file is read-only and lists the locations of object frees if slab debugging is enabled (see - Documentation/vm/slub.rst). + Documentation/mm/slub.rst). What: /sys/kernel/slab//free_fastpath Date: February 2008 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..8c0ea6b6c6a9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5442,7 +5442,7 @@ cache (risks via metadata attacks are mostly unchanged). Debug options disable merging on their own. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slab_max_order= [MM, SLAB] Determines the maximum allowed order for slabs. @@ -5456,13 +5456,13 @@ slub_debug can create guard zones around objects and may poison objects when not in use. Also tracks the last alloc / free. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory fragmentation. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. slub_min_objects= [MM, SLUB] The minimum number of objects per slab. SLUB will @@ -5471,12 +5471,12 @@ the number of objects indicated. The higher the number of objects the smaller the overhead of tracking slabs and the less frequently locks need to be acquired. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_min_order= [MM, SLUB] Determines the minimum page order for slabs. Must be lower than slub_max_order. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_merge [MM, SLUB] Same with slab_merge. diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst index b966fcff993b..c79f1e336222 100644 --- a/Documentation/admin-guide/mm/concepts.rst +++ b/Documentation/admin-guide/mm/concepts.rst @@ -125,7 +125,7 @@ processor. Each bank is referred to as a `node` and for each node Linux constructs an independent memory management subsystem. A node has its own set of zones, lists of free and used pages and various statistics counters. You can find more details about NUMA in -:ref:`Documentation/vm/numa.rst ` and in +:ref:`Documentation/mm/numa.rst ` and in :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `. Page cache diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 61aff88347f3..c4681fa69b9c 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -4,7 +4,7 @@ Monitoring Data Accesses ======================== -:doc:`DAMON ` allows light-weight data access monitoring. +:doc:`DAMON ` allows light-weight data access monitoring. Using DAMON, users can analyze the memory access patterns of their systems and optimize those. diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 46306f1f34b1..a8bd3bd29959 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -268,4 +268,4 @@ granularity reclamation. :: .. [1] https://research.google/pubs/pub48551/ .. [2] https://lwn.net/Articles/787611/ -.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html +.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1bb7b72414b2..5540a3a40fc9 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -30,11 +30,11 @@ DAMON provides below interfaces for different users. `. This will be removed after next LTS kernel is released, so users should move to the :ref:`sysfs interface `. - *Kernel Space Programming Interface.* - :doc:`This ` is for kernel space programmers. Using this, + :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space DAMON application programs for you. You can even extend DAMON for various address spaces. For detail, please refer to the interface - :doc:`document `. + :doc:`document `. .. _sysfs_interface: @@ -185,7 +185,7 @@ controls the monitoring overhead, exist. You can set and get the values by writing to and rading from the files. For more details about the intervals and monitoring regions range, please refer -to the Design document (:doc:`/vm/damon/design`). +to the Design document (:doc:`/mm/damon/design`). contexts//targets/ --------------------- @@ -402,7 +402,7 @@ Attributes Users can get and set the ``sampling interval``, ``aggregation interval``, ``update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. To know about the monitoring -attributes in detail, please refer to the :doc:`/vm/damon/design`. For +attributes in detail, please refer to the :doc:`/mm/damon/design`. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and 1000, and then check it again:: diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5c9aa171a0d3..4a440a7cfeb0 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -760,7 +760,7 @@ and don't use much of it. The default value is 0. -See Documentation/vm/overcommit-accounting.rst and +See Documentation/mm/overcommit-accounting.rst and mm/util.c::__vm_enough_memory() for more information. diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index dedd4d853329..5b1188494bcd 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -87,7 +87,7 @@ Memory management ================= How to allocate and use memory in the kernel. Note that there is a lot -more memory-management documentation in Documentation/vm/index.rst. +more memory-management documentation in Documentation/mm/index.rst. .. toctree:: :maxdepth: 1 diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 1bc91fb8c321..8543a59f288f 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1109,7 +1109,7 @@ CommitLimit yield a CommitLimit of 7.3G. For more details, see the memory overcommit documentation - in vm/overcommit-accounting. + in mm/overcommit-accounting. Committed_AS The amount of memory presently allocated on the system. The committed memory is a sum of all of the memory which diff --git a/Documentation/index.rst b/Documentation/index.rst index 67036a05b771..4737c18c97ff 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -128,7 +128,7 @@ needed). sound/index crypto/index filesystems/index - vm/index + mm/index bpf/index usb/index PCI/index diff --git a/Documentation/mm/active_mm.rst b/Documentation/mm/active_mm.rst new file mode 100644 index 000000000000..6f8269c284ed --- /dev/null +++ b/Documentation/mm/active_mm.rst @@ -0,0 +1,91 @@ +.. _active_mm: + +========= +Active MM +========= + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + Cc'd to linux-kernel, because I don't write explanations all that often, + and when I do I feel better about more people reading them. + + On Fri, 30 Jul 1999, David Mosberger wrote: + > + > Is there a brief description someplace on how "mm" vs. "active_mm" in + > the task_struct are supposed to be used? (My apologies if this was + > discussed on the mailing lists---I just returned from vacation and + > wasn't able to follow linux-kernel for a while). + + Basically, the new setup is: + + - we have "real address spaces" and "anonymous address spaces". The + difference is that an anonymous address space doesn't care about the + user-level page tables at all, so when we do a context switch into an + anonymous address space we just leave the previous address space + active. + + The obvious use for a "anonymous address space" is any thread that + doesn't need any user mappings - all kernel threads basically fall into + this category, but even "real" threads can temporarily say that for + some amount of time they are not going to be interested in user space, + and that the scheduler might as well try to avoid wasting time on + switching the VM state around. Currently only the old-style bdflush + sync does that. + + - "tsk->mm" points to the "real address space". For an anonymous process, + tsk->mm will be NULL, for the logical reason that an anonymous process + really doesn't _have_ a real address space at all. + + - however, we obviously need to keep track of which address space we + "stole" for such an anonymous user. For that, we have "tsk->active_mm", + which shows what the currently active address space is. + + The rule is that for a process with a real address space (ie tsk->mm is + non-NULL) the active_mm obviously always has to be the same as the real + one. + + For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the + "borrowed" mm while the anonymous process is running. When the + anonymous process gets scheduled away, the borrowed address space is + returned and cleared. + + To support all that, the "struct mm_struct" now has two counters: a + "mm_users" counter that is how many "real address space users" there are, + and a "mm_count" counter that is the number of "lazy" users (ie anonymous + users) plus one if there are any real users. + + Usually there is at least one real user, but it could be that the real + user exited on another CPU while a lazy user was still active, so you do + actually get cases where you have a address space that is _only_ used by + lazy users. That is often a short-lived state, because once that thread + gets scheduled away in favour of a real thread, the "zombie" mm gets + released because "mm_count" becomes zero. + + Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any + more. "init_mm" should be considered just a "lazy context when no other + context is available", and in fact it is mainly used just at bootup when + no real VM has yet been created. So code that used to check + + if (current->mm == &init_mm) + + should generally just do + + if (!current->mm) + + instead (which makes more sense anyway - the test is basically one of "do + we have a user context", and is generally done by the page fault handler + and things like that). + + Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, + because it slightly changes the interfaces to accommodate the alpha (who + would have thought it, but the alpha actually ends up having one of the + ugliest context switch codes - unlike the other architectures where the MM + and register state is separate, the alpha PALcode joins the two, and you + need to switch both together). + + (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst new file mode 100644 index 000000000000..cbaee9e59241 --- /dev/null +++ b/Documentation/mm/arch_pgtable_helpers.rst @@ -0,0 +1,260 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _arch_page_table_helpers: + +=============================== +Architecture Page Table Helpers +=============================== + +Generic MM expects architectures (with MMU) to provide helpers to create, access +and modify page table entries at various level for different memory functions. +These page table helpers need to conform to a common semantics across platforms. +Following tables describe the expected semantics which can also be tested during +boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug +test need to be in sync. + + +PTE Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pte_same | Tests whether both PTE entries are the same | ++---------------------------+--------------------------------------------------+ +| pte_bad | Tests a non-table mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_present | Tests a valid mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_young | Tests a young PTE | ++---------------------------+--------------------------------------------------+ +| pte_dirty | Tests a dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_write | Tests a writable PTE | ++---------------------------+--------------------------------------------------+ +| pte_special | Tests a special PTE | ++---------------------------+--------------------------------------------------+ +| pte_protnone | Tests a PROT_NONE PTE | ++---------------------------+--------------------------------------------------+ +| pte_devmap | Tests a ZONE_DEVICE mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_soft_dirty | Tests a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_soft_dirty | Tests a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkyoung | Creates a young PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkold | Creates an old PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkdirty | Creates a dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkclean | Creates a clean PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkwrite | Creates a writable PTE | ++---------------------------+--------------------------------------------------+ +| pte_wrprotect | Creates a write protected PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkspecial | Creates a special PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkdevmap | Creates a ZONE_DEVICE mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mksoft_dirty | Creates a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_clear_soft_dirty | Clears a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_mksoft_dirty | Creates a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_clear_soft_dirty | Clears a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mknotpresent | Invalidates a mapped PTE | ++---------------------------+--------------------------------------------------+ +| ptep_clear | Clears a PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear | Clears and returns PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | ++---------------------------+--------------------------------------------------+ +| ptep_test_and_clear_young | Clears young from a PTE | ++---------------------------+--------------------------------------------------+ +| ptep_set_wrprotect | Converts into a write protected PTE | ++---------------------------+--------------------------------------------------+ +| ptep_set_access_flags | Converts into a more permissive PTE | ++---------------------------+--------------------------------------------------+ + + +PMD Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pmd_same | Tests whether both PMD entries are the same | ++---------------------------+--------------------------------------------------+ +| pmd_bad | Tests a non-table mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_leaf | Tests a leaf mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_huge | Tests a HugeTLB mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | ++---------------------------+--------------------------------------------------+ +| pmd_present | Tests a valid mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_young | Tests a young PMD | ++---------------------------+--------------------------------------------------+ +| pmd_dirty | Tests a dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_write | Tests a writable PMD | ++---------------------------+--------------------------------------------------+ +| pmd_special | Tests a special PMD | ++---------------------------+--------------------------------------------------+ +| pmd_protnone | Tests a PROT_NONE PMD | ++---------------------------+--------------------------------------------------+ +| pmd_devmap | Tests a ZONE_DEVICE mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_soft_dirty | Tests a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_soft_dirty | Tests a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkyoung | Creates a young PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkold | Creates an old PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkdirty | Creates a dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkclean | Creates a clean PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkwrite | Creates a writable PMD | ++---------------------------+--------------------------------------------------+ +| pmd_wrprotect | Creates a write protected PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkspecial | Creates a special PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkdevmap | Creates a ZONE_DEVICE mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mksoft_dirty | Creates a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_clear_soft_dirty | Clears a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_mksoft_dirty | Creates a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkinvalid | Invalidates a mapped PMD [1] | ++---------------------------+--------------------------------------------------+ +| pmd_set_huge | Creates a PMD huge mapping | ++---------------------------+--------------------------------------------------+ +| pmd_clear_huge | Clears a PMD huge mapping | ++---------------------------+--------------------------------------------------+ +| pmdp_get_and_clear | Clears a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_get_and_clear_full | Clears a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_test_and_clear_young | Clears young from a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_set_wrprotect | Converts into a write protected PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_set_access_flags | Converts into a more permissive PMD | ++---------------------------+--------------------------------------------------+ + + +PUD Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pud_same | Tests whether both PUD entries are the same | ++---------------------------+--------------------------------------------------+ +| pud_bad | Tests a non-table mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_leaf | Tests a leaf mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_huge | Tests a HugeTLB mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_trans_huge | Tests a Transparent Huge Page (THP) at PUD | ++---------------------------+--------------------------------------------------+ +| pud_present | Tests a valid mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_young | Tests a young PUD | ++---------------------------+--------------------------------------------------+ +| pud_dirty | Tests a dirty PUD | ++---------------------------+--------------------------------------------------+ +| pud_write | Tests a writable PUD | ++---------------------------+--------------------------------------------------+ +| pud_devmap | Tests a ZONE_DEVICE mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkyoung | Creates a young PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkold | Creates an old PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkdirty | Creates a dirty PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkclean | Creates a clean PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkwrite | Creates a writable PUD | ++---------------------------+--------------------------------------------------+ +| pud_wrprotect | Creates a write protected PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkinvalid | Invalidates a mapped PUD [1] | ++---------------------------+--------------------------------------------------+ +| pud_set_huge | Creates a PUD huge mapping | ++---------------------------+--------------------------------------------------+ +| pud_clear_huge | Clears a PUD huge mapping | ++---------------------------+--------------------------------------------------+ +| pudp_get_and_clear | Clears a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_get_and_clear_full | Clears a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_test_and_clear_young | Clears young from a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_set_wrprotect | Converts into a write protected PUD | ++---------------------------+--------------------------------------------------+ +| pudp_set_access_flags | Converts into a more permissive PUD | ++---------------------------+--------------------------------------------------+ + + +HugeTLB Page Table Helpers +========================== + ++---------------------------+--------------------------------------------------+ +| pte_huge | Tests a HugeTLB | ++---------------------------+--------------------------------------------------+ +| pte_mkhuge | Creates a HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_dirty | Tests a dirty HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_write | Tests a writable HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_mkdirty | Creates a dirty HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_mkwrite | Creates a writable HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_wrprotect | Creates a write protected HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_get_and_clear | Clears a HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_set_wrprotect | Converts into a write protected HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB | ++---------------------------+--------------------------------------------------+ + + +SWAP Page Table Helpers +======================== + ++---------------------------+--------------------------------------------------+ +| __pte_to_swp_entry | Creates a swapped entry (arch) from a mapped PTE | ++---------------------------+--------------------------------------------------+ +| __swp_to_pte_entry | Creates a mapped PTE from a swapped entry (arch) | ++---------------------------+--------------------------------------------------+ +| __pmd_to_swp_entry | Creates a swapped entry (arch) from a mapped PMD | ++---------------------------+--------------------------------------------------+ +| __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | ++---------------------------+--------------------------------------------------+ +| is_migration_entry | Tests a migration (read or write) swapped entry | ++-------------------------------+----------------------------------------------+ +| is_writable_migration_entry | Tests a write migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_readable_migration_entry | Creates a read migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_writable_migration_entry | Creates a write migration swapped entry | ++-------------------------------+----------------------------------------------+ + +[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst new file mode 100644 index 000000000000..6a1fadf3e173 --- /dev/null +++ b/Documentation/mm/balance.rst @@ -0,0 +1,102 @@ +.. _balance: + +================ +Memory Balancing +================ + +Started Jan 2000 by Kanoj Sarcar + +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. + +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. + +__GFP_IO allocation requests are made to prevent file system deadlocks. + +In the absence of non sleepable allocation requests, it seems detrimental +to be doing balancing. Page reclamation can be kicked off lazily, that +is, only when needed (aka zone free memory is 0), instead of making it +a proactive process. + +That being said, the kernel should try to fulfill requests for direct +mapped pages from the direct mapped pool, instead of falling back on +the dma pool, so as to keep the dma pool filled for dma requests (atomic +or not). A similar argument applies to highmem and direct mapped pages. +OTOH, if there is a lot of free dma pages, it is preferable to satisfy +regular memory requests by allocating one from the dma pool, instead +of incurring the overhead of regular zone balancing. + +In 2.2, memory balancing/page reclamation would kick off only when the +_total_ number of free pages fell below 1/64 th of total memory. With the +right ratio of dma and regular memory, it is quite possible that balancing +would not be done even when the dma zone was completely empty. 2.2 has +been running production machines of varying memory sizes, and seems to be +doing fine even with the presence of this problem. In 2.3, due to +HIGHMEM, this problem is aggravated. + +In 2.3, zone balancing can be done in one of two ways: depending on the +zone size (and possibly of the size of lower class zones), we can decide +at init time how many free pages we should aim for while balancing any +zone. The good part is, while balancing, we do not need to look at sizes +of lower class zones, the bad part is, we might do too frequent balancing +due to ignoring possibly lower usage in the lower class zones. Also, +with a slight change in the allocation routine, it is possible to reduce +the memclass() macro to be a simple equality. + +Another possible solution is that we balance only when the free memory +of a zone _and_ all its lower class zones falls below 1/64th of the +total memory in the zone and its lower class zones. This fixes the 2.2 +balancing problem, and stays as close to 2.2 behavior as possible. Also, +the balancing algorithm works the same way on the various architectures, +which have different numbers and types of zones. If we wanted to get +fancy, we could assign different weights to free pages in different +zones in the future. + +Note that if the size of the regular zone is huge compared to dma zone, +it becomes less significant to consider the free dma pages while +deciding whether to balance the regular zone. The first solution +becomes more attractive then. + +The appended patch implements the second solution. It also "fixes" two +problems: first, kswapd is woken up as in 2.2 on low memory conditions +for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, +so as to give a fighting chance for replace_with_highmem() to get a +HIGHMEM page, as well as to ensure that HIGHMEM allocations do not +fall back into regular zone. This also makes sure that HIGHMEM pages +are not leaked (for example, in situations where a HIGHMEM page is in +the swapcache but is not being used by anyone) + +kswapd also needs to know about the zones it should balance. kswapd is +primarily needed in a situation where balancing can not be done, +probably because all allocation requests are coming from intr context +and all process contexts are sleeping. For 2.3, kswapd does not really +need to balance the highmem zone, since intr context does not request +highmem pages. kswapd looks at the zone_wake_kswapd field in the zone +structure to decide whether a zone needs balancing. + +Page stealing from process memory and shm is done if stealing the page would +alleviate memory pressure on any zone in the page's node that has fallen below +its watermark. + +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +are per-zone fields, used to determine when a zone needs to be balanced. When +the number of pages falls below watermark[WMARK_MIN], the hysteric field +low_on_memory gets set. This stays set till the number of free pages becomes +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will +try to free some pages in the zone (providing GFP_WAIT is set in the request). +Orthogonal to this, is the decision to poke kswapd to free some zone pages. +That decision is not hysteresis based, and is done when the number of free +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. + + +(Good) Ideas that I have heard: + +1. Dynamic experience should influence balancing: number of failed requests + for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) +2. Implement a replace_with_highmem()-like replace_with_regular() to preserve + dma pages. (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/mm/bootmem.rst b/Documentation/mm/bootmem.rst new file mode 100644 index 000000000000..eb2b31eedfa1 --- /dev/null +++ b/Documentation/mm/bootmem.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +Boot Memory +=========== diff --git a/Documentation/mm/damon/api.rst b/Documentation/mm/damon/api.rst new file mode 100644 index 000000000000..08f34df45523 --- /dev/null +++ b/Documentation/mm/damon/api.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +API Reference +============= + +Kernel space programs can use every feature of DAMON using below APIs. All you +need to do is including ``damon.h``, which is located in ``include/linux/`` of +the source tree. + +Structures +========== + +.. kernel-doc:: include/linux/damon.h + + +Functions +========= + +.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst new file mode 100644 index 000000000000..0cff6fac6b7e --- /dev/null +++ b/Documentation/mm/damon/design.rst @@ -0,0 +1,176 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +Design +====== + +Configurable Layers +=================== + +DAMON provides data access monitoring functionality while making the accuracy +and the overhead controllable. The fundamental access monitorings require +primitives that dependent on and optimized for the target address space. On +the other hand, the accuracy and overhead tradeoff mechanism, which is the core +of DAMON, is in the pure logic space. DAMON separates the two parts in +different layers and defines its interface to allow various low level +primitives implementations configurable with the core logic. We call the low +level primitives implementations monitoring operations. + +Due to this separated design and the configurable interface, users can extend +DAMON for any address space by configuring the core logics with appropriate +monitoring operations. If appropriate one is not provided, users can implement +the operations on their own. + +For example, physical memory, virtual memory, swap space, those for specific +processes, NUMA nodes, files, and backing memory devices would be supportable. +Also, if some architectures or devices support special optimized access check +primitives, those will be easily configurable. + + +Reference Implementations of Address Space Specific Monitoring Operations +========================================================================= + +The monitoring operations are defined in two parts: + +1. Identification of the monitoring target address range for the address space. +2. Access check of specific address range in the target space. + +DAMON currently provides the implementations of the operations for the physical +and virtual address spaces. Below two subsections describe how those work. + + +VMA-based Target Address Range Construction +------------------------------------------- + +This is only for the virtual address space monitoring operations +implementation. That for the physical address space simply asks users to +manually set the monitoring target address ranges. + +Only small parts in the super-huge virtual address space of the processes are +mapped to the physical memory and accessed. Thus, tracking the unmapped +address regions is just wasteful. However, because DAMON can deal with some +level of noise using the adaptive regions adjustment mechanism, tracking every +mapping is not strictly required but could even incur a high overhead in some +cases. That said, too huge unmapped areas inside the monitoring target should +be removed to not take the time for the adaptive mechanism. + +For the reason, this implementation converts the complex mappings to three +distinct regions that cover every mapped area of the address space. The two +gaps between the three regions are the two biggest unmapped areas in the given +address space. The two biggest unmapped areas would be the gap between the +heap and the uppermost mmap()-ed region, and the gap between the lowermost +mmap()-ed region and the stack in most of the cases. Because these gaps are +exceptionally huge in usual address spaces, excluding these will be sufficient +to make a reasonable trade-off. Below shows this in detail:: + + + + + (small mmap()-ed regions and munmap()-ed regions) + + + + + +PTE Accessed-bit Based Access Check +----------------------------------- + +Both of the implementations for physical and virtual address spaces use PTE +Accessed-bit for basic access checks. Only one difference is the way of +finding the relevant PTE Accessed bit(s) from the address. While the +implementation for the virtual address walks the page table for the target task +of the address, the implementation for the physical address walks every page +table having a mapping to the address. In this way, the implementations find +and clear the bit(s) for next sampling target address and checks whether the +bit(s) set again after one sampling period. This could disturb other kernel +subsystems using the Accessed bits, namely Idle page tracking and the reclaim +logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling +the interference is the responsibility of sysadmins. However, it solves the +conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, +as Idle page tracking does. + + +Address Space Independent Core Mechanisms +========================================= + +Below four sections describe each of the DAMON core mechanisms and the five +monitoring attributes, ``sampling interval``, ``aggregation interval``, +``update interval``, ``minimum number of regions``, and ``maximum number of +regions``. + + +Access Frequency Monitoring +--------------------------- + +The output of DAMON says what pages are how frequently accessed for a given +duration. The resolution of the access frequency is controlled by setting +``sampling interval`` and ``aggregation interval``. In detail, DAMON checks +access to each page per ``sampling interval`` and aggregates the results. In +other words, counts the number of the accesses to each page. After each +``aggregation interval`` passes, DAMON calls callback functions that previously +registered by users so that users can read the aggregated results and then +clears the results. This can be described in below simple pseudo-code:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +The monitoring overhead of this mechanism will arbitrarily increase as the +size of the target workload grows. + + +Region Based Sampling +--------------------- + +To avoid the unbounded increase of the overhead, DAMON groups adjacent pages +that assumed to have the same access frequencies into a region. As long as the +assumption (pages in a region have the same access frequencies) is kept, only +one page in the region is required to be checked. Thus, for each ``sampling +interval``, DAMON randomly picks one page in each region, waits for one +``sampling interval``, checks whether the page is accessed meanwhile, and +increases the access frequency of the region if so. Therefore, the monitoring +overhead is controllable by setting the number of regions. DAMON allows users +to set the minimum and the maximum number of regions for the trade-off. + +This scheme, however, cannot preserve the quality of the output if the +assumption is not guaranteed. + + +Adaptive Regions Adjustment +--------------------------- + +Even somehow the initial monitoring target regions are well constructed to +fulfill the assumption (pages in same region have similar access frequencies), +the data access pattern can be dynamically changed. This will result in low +monitoring quality. To keep the assumption as much as possible, DAMON +adaptively merges and splits each region based on their access frequency. + +For each ``aggregation interval``, it compares the access frequencies of +adjacent regions and merges those if the frequency difference is small. Then, +after it reports and clears the aggregated access frequency of each region, it +splits each region into two or three regions if the total number of regions +will not exceed the user-specified maximum number of regions after the split. + +In this way, DAMON provides its best-effort quality and minimal overhead while +keeping the bounds users set for their trade-off. + + +Dynamic Target Space Updates Handling +------------------------------------- + +The monitoring target address range could dynamically changed. For example, +virtual memory could be dynamically mapped and unmapped. Physical memory could +be hot-plugged. + +As the changes could be quite frequent in some cases, DAMON allows the +monitoring operations to check dynamic changes including memory mapping changes +and applies it to monitoring operations-related data structures such as the +abstracted monitoring target memory area only for each of a user-specified time +interval (``update interval``). diff --git a/Documentation/mm/damon/faq.rst b/Documentation/mm/damon/faq.rst new file mode 100644 index 000000000000..dde7e2414ee6 --- /dev/null +++ b/Documentation/mm/damon/faq.rst @@ -0,0 +1,50 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Frequently Asked Questions +========================== + +Why a new subsystem, instead of extending perf or other user space tools? +========================================================================= + +First, because it needs to be lightweight as much as possible so that it can be +used online, any unnecessary overhead such as kernel - user space context +switching cost should be avoided. Second, DAMON aims to be used by other +programs including the kernel. Therefore, having a dependency on specific +tools like perf is not desirable. These are the two biggest reasons why DAMON +is implemented in the kernel space. + + +Can 'idle pages tracking' or 'perf mem' substitute DAMON? +========================================================= + +Idle page tracking is a low level primitive for access check of the physical +address space. 'perf mem' is similar, though it can use sampling to minimize +the overhead. On the other hand, DAMON is a higher-level framework for the +monitoring of various address spaces. It is focused on memory management +optimization and provides sophisticated accuracy/overhead handling mechanisms. +Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of +DAMON's output, but cannot substitute DAMON. + + +Does DAMON support virtual memory only? +======================================= + +No. The core of the DAMON is address space independent. The address space +specific monitoring operations including monitoring target regions +constructions and actual access checks can be implemented and configured on the +DAMON core by the users. In this way, DAMON users can monitor any address +space with any access check technique. + +Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based +implementations of the address space dependent functions for the virtual memory +and the physical memory by default, for a reference and convenient use. + + +Can I simply monitor page granularity? +====================================== + +Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the +working set size divided by the page size. Because the monitoring target +regions size is forced to be ``>=page size``, the region split will make no +effect. diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst new file mode 100644 index 000000000000..48c0bbff98b2 --- /dev/null +++ b/Documentation/mm/damon/index.rst @@ -0,0 +1,29 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +DAMON: Data Access MONitor +========================== + +DAMON is a data access monitoring framework subsystem for the Linux kernel. +The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it + + - *accurate* (the monitoring output is useful enough for DRAM level memory + management; It might not appropriate for CPU Cache levels, though), + - *light-weight* (the monitoring overhead is low enough to be applied online), + and + - *scalable* (the upper-bound of the overhead is in constant range regardless + of the size of target workloads). + +Using this framework, therefore, the kernel's memory management mechanisms can +make advanced decisions. Experimental memory management optimization works +that incurring high data accesses monitoring overhead could implemented again. +In user space, meanwhile, users who have some special workloads can write +personalized applications for better understanding and optimizations of their +workloads and systems. + +.. toctree:: + :maxdepth: 2 + + faq + design + api diff --git a/Documentation/mm/free_page_reporting.rst b/Documentation/mm/free_page_reporting.rst new file mode 100644 index 000000000000..8c05e62d8b2b --- /dev/null +++ b/Documentation/mm/free_page_reporting.rst @@ -0,0 +1,40 @@ +.. _free_page_reporting: + +===================== +Free Page Reporting +===================== + +Free page reporting is an API by which a device can register to receive +lists of pages that are currently unused by the system. This is useful in +the case of virtualization where a guest is then able to use this data to +notify the hypervisor that it is no longer using certain pages in memory. + +For the driver, typically a balloon driver, to use of this functionality +it will allocate and initialize a page_reporting_dev_info structure. The +field within the structure it will populate is the "report" function +pointer used to process the scatterlist. It must also guarantee that it can +handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per +call to the function. A call to page_reporting_register will register the +page reporting interface with the reporting framework assuming no other +page reporting devices are already registered. + +Once registered the page reporting API will begin reporting batches of +pages to the driver. The API will start reporting pages 2 seconds after +the interface is registered and will continue to do so 2 seconds after any +page of a sufficiently high order is freed. + +Pages reported will be stored in the scatterlist passed to the reporting +function with the final entry having the end bit set in entry nent - 1. +While pages are being processed by the report function they will not be +accessible to the allocator. Once the report function has been completed +the pages will be returned to the free area from which they were obtained. + +Prior to removing a driver that is making use of free page reporting it +is necessary to call page_reporting_unregister to have the +page_reporting_dev_info structure that is currently in use by free page +reporting removed. Doing this will prevent further reports from being +issued via the interface. If another driver or the same driver is +registered it is possible for it to resume where the previous driver had +left off in terms of reporting free pages. + +Alexander Duyck, Dec 04, 2019 diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst new file mode 100644 index 000000000000..feecc5e24477 --- /dev/null +++ b/Documentation/mm/frontswap.rst @@ -0,0 +1,266 @@ +.. _frontswap: + +========= +Frontswap +========= + +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). A "store" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "load" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further stores with that swap type. + +Once a page is successfully stored, a matching load on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. If the store returns +success, the data has been successfully saved to transcendent memory and +a disk write and, if the data is later read back, a disk read are avoided. +If a store returns failure, transcendent memory has rejected the data, and the +page can be written to swap as usual. + +Note that if a page is stored and the page already exists in transcendent memory +(a "duplicate" store), either the store succeeds and the data is overwritten, +or the store fails AND the page is invalidated. This ensures stale data may +never be obtained from frontswap. + +If properly configured, monitoring of frontswap is done via debugfs in +the `/sys/kernel/debug/frontswap` directory. The effectiveness of +frontswap can be measured (across all swap devices) with: + +``failed_stores`` + how many store attempts have failed + +``loads`` + how many loads were attempted (all should succeed) + +``succ_stores`` + how many store attempts have succeeded + +``invalidates`` + how many invalidates were attempted + +A backend implementation may provide additional metrics. + +FAQ +=== + +* Where's the value? + +When a workload starts swapping, performance falls through the floor. +Frontswap significantly increases performance in many such workloads by +providing a clean, dynamic interface to read and write swap pages to +"transcendent memory" that is otherwise not directly addressable to the kernel. +This interface is ideal when data is transformed to a different form +and size (such as with compression) or secretly moved (as might be +useful for write-balancing for some RAM-like devices). Swap pages (and +evicted page-cache pages) are a great use for this kind of slower-than-RAM- +but-much-faster-than-disk "pseudo-RAM device". + +Frontswap with a fairly small impact on the kernel, +provides a huge amount of flexibility for more dynamic, flexible RAM +utilization in various system configurations: + +In the single kernel case, aka "zcache", pages are compressed and +stored in local memory, thus increasing the total anonymous pages +that can be safely kept in RAM. Zcache essentially trades off CPU +cycles used in compression/decompression for better memory utilization. +Benchmarks have shown little or no impact when memory pressure is +low while providing a significant performance improvement (25%+) +on some workloads under high memory pressure. + +"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory +support for clustered systems. Frontswap pages are locally compressed +as in zcache, but then "remotified" to another system's RAM. This +allows RAM to be dynamically load-balanced back-and-forth as needed, +i.e. when system A is overcommitted, it can swap to system B, and +vice versa. RAMster can also be configured as a memory server so +many servers in a cluster can swap, dynamically as needed, to a single +server configured with a large amount of RAM... without pre-configuring +how much of the RAM is available for each of the clients! + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to do +it well with no kernel changes have essentially failed (except in some +well-publicized special-case workloads). +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "selfballooning"), sudden unexpected +memory pressure may result in swapping; frontswap allows those pages +to be swapped to and from hypervisor RAM (if overall host system memory +conditions allow), thus mitigating the potentially awful performance impact +of unplanned swapping. + +A KVM implementation is underway and has been RFC'ed to lkml. And, +using frontswap, investigation is also underway on the use of NVM as +a memory extension technology. + +* Sure there may be performance advantages in some situations, but + what's the space/time overhead of frontswap? + +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into +nothingness and the only overhead is a few extra bytes per swapon'ed +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" +registers, there is one extra global variable compared to zero for +every swap page read or written. If CONFIG_FRONTSWAP is enabled +AND a frontswap backend registers AND the backend fails every "store" +request (i.e. provides no memory despite claiming it might), +CPU overhead is still negligible -- and since every frontswap fail +precedes a swap page write-to-disk, the system is highly likely +to be I/O bound and using a small fraction of a percent of a CPU +will be irrelevant anyway. + +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend +registers, one bit is allocated for every swap page for every swap +device that is swapon'd. This is added to the EIGHT bits (which +was sixteen until about 2.6.34) that the kernel already allocates +for every swap page for every swap device that is swapon'd. (Hugh +Dickins has observed that frontswap could probably steal one of +the existing eight bits, but let's worry about that minor optimization +later.) For very large swap disks (which are rare) on a standard +4K pagesize, this is 1MB per 32GB swap. + +When swap pages are stored in transcendent memory instead of written +out to disk, there is a side effect that this may create more memory +pressure that can potentially outweigh the other advantages. A +backend, such as zcache, must implement policies to carefully (but +dynamically) manage memory limits to ensure this doesn't happen. + +* OK, how about a quick overview of what this frontswap patch does + in terms that a kernel hacker can grok? + +Let's assume that a frontswap "backend" has registered during +kernel initialization; this registration indicates that this +frontswap backend has access to some "memory" that is not directly +accessible by the kernel. Exactly how much memory it provides is +entirely dynamic and random. + +Whenever a swap-device is swapon'd frontswap_init() is called, +passing the swap device number (aka "type") as a parameter. +This notifies frontswap to expect attempts to "store" swap pages +associated with that number. + +Whenever the swap subsystem is readying a page to write to a swap +device (c.f swap_writepage()), frontswap_store is called. Frontswap +consults with the frontswap backend and if the backend says it does NOT +have room, frontswap_store returns -1 and the kernel swaps the page +to the swap device as normal. Note that the response from the frontswap +backend is unpredictable to the kernel; it may choose to never accept a +page, it could accept every ninth page, or it might accept every +page. But if the backend does accept a page, the data from the page +has already been copied and associated with the type and offset, +and the backend guarantees the persistence of the data. In this case, +frontswap sets a bit in the "frontswap_map" for the swap device +corresponding to the page offset on the swap device to which it would +otherwise have written the data. + +When the swap subsystem needs to swap-in a page (swap_readpage()), +it first calls frontswap_load() which checks the frontswap_map to +see if the page was earlier accepted by the frontswap backend. If +it was, the page of data is filled from the frontswap backend and +the swap-in is complete. If not, the normal swap-in code is +executed to obtain the page of data from the real swap device. + +So every time the frontswap backend accepts a page, a swap device read +and (potentially) a swap device write are replaced by a "frontswap backend +store" and (possibly) a "frontswap backend loads", which are presumably much +faster. + +* Can't frontswap be configured as a "special" swap device that is + just higher priority than any real swap device (e.g. like zswap, + or maybe swap-over-nbd/NFS)? + +No. First, the existing swap subsystem doesn't allow for any kind of +swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, +but this would require fairly drastic changes. Even if it were +rewritten, the existing swap subsystem uses the block I/O layer which +assumes a swap device is fixed size and any page in it is linearly +addressable. Frontswap barely touches the existing swap subsystem, +and works around the constraints of the block I/O subsystem to provide +a great deal of flexibility and dynamicity. + +For example, the acceptance of any swap page by the frontswap backend is +entirely unpredictable. This is critical to the definition of frontswap +backends because it grants completely dynamic discretion to the +backend. In zcache, one cannot know a priori how compressible a page is. +"Poorly" compressible pages can be rejected, and "poorly" can itself be +defined dynamically depending on current memory constraints. + +Further, frontswap is entirely synchronous whereas a real swap +device is, by definition, asynchronous and uses block I/O. The +block I/O layer is not only unnecessary, but may perform "optimizations" +that are inappropriate for a RAM-oriented device including delaying +the write of some pages for a significant amount of time. Synchrony is +required to ensure the dynamicity of the backend and to avoid thorny race +conditions that would unnecessarily and greatly complicate frontswap +and/or the block I/O subsystem. That said, only the initial "store" +and "load" operations need be synchronous. A separate asynchronous thread +is free to manipulate the pages stored by frontswap. For example, +the "remotification" thread in RAMster uses standard asynchronous +kernel sockets to move compressed frontswap pages to a remote machine. +Similarly, a KVM guest-side implementation could do in-guest compression +and use "batched" hypercalls. + +In a virtualized environment, the dynamicity allows the hypervisor +(or host OS) to do "intelligent overcommit". For example, it can +choose to accept pages only until host-swapping might be imminent, +then force guests to do their own swapping. + +There is a downside to the transcendent memory specifications for +frontswap: Since any "store" might fail, there must always be a real +slot on a real swap device to swap the page. Thus frontswap must be +implemented as a "shadow" to every swapon'd device with the potential +capability of holding every page that the swap device might have held +and the possibility that it might hold no pages at all. This means +that frontswap cannot contain more pages than the total of swapon'd +swap devices. For example, if NO swap device is configured on some +installation, frontswap is useless. Swapless portable devices +can still use frontswap but a backend for such devices must configure +some kind of "ghost" swap device and ensure that it is never used. + +* Why this weird definition about "duplicate stores"? If a page + has been previously successfully stored, can't it always be + successfully overwritten? + +Nearly always it can, but no, sometimes it cannot. Consider an example +where data is compressed and the original 4K page has been compressed +to 1K. Now an attempt is made to overwrite the page with data that +is non-compressible and so would take the entire 4K. But the backend +has no more space. In this case, the store must be rejected. Whenever +frontswap rejects a store that would overwrite, it also must invalidate +the old data and ensure that it is no longer accessible. Since the +swap subsystem then writes the new data to the read swap device, +this is the correct course of action to ensure coherency. + +* Why does the frontswap patch create the new include file swapfile.h? + +The frontswap code depends on some swap-subsystem-internal data +structures that have, over the years, moved back and forth between +static and global. This seemed a reasonable compromise: Define +them as global but declare them in a new include file that isn't +included by the large number of source files that include swap.h. + +Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst new file mode 100644 index 000000000000..c9887f241c6c --- /dev/null +++ b/Documentation/mm/highmem.rst @@ -0,0 +1,167 @@ +.. _highmem: + +==================== +High Memory Handling +==================== + +By: Peter Zijlstra + +.. contents:: :local: + +What Is High Memory? +==================== + +High memory (highmem) is used when the size of physical memory approaches or +exceeds the maximum size of virtual memory. At that point it becomes +impossible for the kernel to keep all of the available physical memory mapped +at all times. This means the kernel needs to start using temporary mappings of +the pieces of physical memory that it wants to access. + +The part of (physical) memory not covered by a permanent mapping is what we +refer to as 'highmem'. There are various architecture dependent constraints on +where exactly that border lies. + +In the i386 arch, for example, we choose to map the kernel into every process's +VM space so that we don't have to pay the full TLB invalidation costs for +kernel entry/exit. This means the available virtual memory space (4GiB on +i386) has to be divided between user and kernel space. + +The traditional split for architectures using this approach is 3:1, 3GiB for +userspace and the top 1GiB for kernel space:: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +This means that the kernel can at most map 1GiB of physical memory at any one +time, but because we need virtual address space for other things - including +temporary maps to access the rest of the physical memory - the actual direct +map will typically be less (usually around ~896MiB). + +Other architectures that have mm context tagged TLBs can have separate kernel +and user maps. Some hardware (like some ARMs), however, have limited virtual +space when they use mm context tags. + + +Temporary Virtual Mappings +========================== + +The kernel contains several ways of creating temporary mappings. The following +list shows them in order of preference of use. + +* kmap_local_page(). This function is used to require short term mappings. + It can be invoked from any context (including interrupts) but the mappings + can only be used in the context which acquired them. + + This function should be preferred, where feasible, over all the others. + + These mappings are thread-local and CPU-local, meaning that the mapping + can only be accessed from within this thread and the thread is bound the + CPU while the mapping is active. Even if the thread is preempted (since + preemption is never disabled by the function) the CPU can not be + unplugged from the system via CPU-hotplug until the mapping is disposed. + + It's valid to take pagefaults in a local kmap region, unless the context + in which the local mapping is acquired does not allow it for other reasons. + + kmap_local_page() always returns a valid virtual address and it is assumed + that kunmap_local() will never fail. + + Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain + extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered + because the map implementation is stack based. See kmap_local_page() kdocs + (included in the "Functions" section) for details on how to manage nested + mappings. + +* kmap_atomic(). This permits a very short duration mapping of a single + page. Since the mapping is restricted to the CPU that issued it, it + performs well, but the issuing task is therefore required to stay on that + CPU until it has finished, lest some other task displace its mappings. + + kmap_atomic() may also be used by interrupt contexts, since it does not + sleep and the callers too may not sleep until after kunmap_atomic() is + called. + + Each call of kmap_atomic() in the kernel creates a non-preemptible section + and disable pagefaults. This could be a source of unwanted latency. Therefore + users should prefer kmap_local_page() instead of kmap_atomic(). + + It is assumed that k[un]map_atomic() won't fail. + +* kmap(). This should be used to make short duration mapping of a single + page with no restrictions on preemption or migration. It comes with an + overhead as mapping space is restricted and protected by a global lock + for synchronization. When mapping is no longer needed, the address that + the page was mapped to must be released with kunmap(). + + Mapping changes must be propagated across all the CPUs. kmap() also + requires global TLB invalidation when the kmap's pool wraps and it might + block when the mapping space is fully utilized until a slot becomes + available. Therefore, kmap() is only callable from preemptible context. + + All the above work is necessary if a mapping must last for a relatively + long time but the bulk of high-memory mappings in the kernel are + short-lived and only used in one place. This means that the cost of + kmap() is mostly wasted in such cases. kmap() was not intended for long + term mappings but it has morphed in that direction and its use is + strongly discouraged in newer code and the set of the preceding functions + should be preferred. + + On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have + no real work to do because a 64-bit address space is more than sufficient to + address all the physical memory whose pages are permanently mapped. + +* vmap(). This can be used to make a long duration mapping of multiple + physical pages into a contiguous virtual space. It needs global + synchronization to unmap. + + +Cost of Temporary Mappings +========================== + +The cost of creating temporary mappings can be quite high. The arch has to +manipulate the kernel's page tables, the data TLB and/or the MMU's registers. + +If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping +simply with a bit of arithmetic that will convert the page struct address into +a pointer to the page contents rather than juggling mappings about. In such a +case, the unmap operation may be a null operation. + +If CONFIG_MMU is not set, then there can be no temporary mappings and no +highmem. In such a case, the arithmetic approach will also be used. + + +i386 PAE +======== + +The i386 arch, under some circumstances, will permit you to stick up to 64GiB +of RAM into your 32-bit machine. This has a number of consequences: + +* Linux needs a page-frame structure for each page in the system and the + pageframes need to live in the permanent mapping, which means: + +* you can have 896M/sizeof(struct page) page-frames at most; with struct + page being 32-bytes that would end up being something in the order of 112G + worth of pages; the kernel, however, needs to store more than just + page-frames in that memory... + +* PAE makes your page tables larger - which slows the system down as more + data has to be accessed to traverse in TLB fills and the like. One + advantage is that PAE has more PTE bits and can provide advanced features + like NX and PAT. + +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. + + +Functions +========= + +.. kernel-doc:: include/linux/highmem.h +.. kernel-doc:: include/linux/highmem-internal.h diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst new file mode 100644 index 000000000000..f2a59ed82ed3 --- /dev/null +++ b/Documentation/mm/hmm.rst @@ -0,0 +1,452 @@ +.. _hmm: + +===================================== +Heterogeneous Memory Management (HMM) +===================================== + +Provide infrastructure and helpers to integrate non-conventional memory (device +memory like GPU on board memory) into regular kernel path, with the cornerstone +of this being specialized struct page for such memory (see sections 5 to 7 of +this document). + +HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., +allowing a device to transparently access program addresses coherently with +the CPU meaning that any valid pointer on the CPU is also a valid pointer +for the device. This is becoming mandatory to simplify the use of advanced +heterogeneous computing where GPU, DSP, or FPGA are used to perform various +computations on behalf of a process. + +This document is divided as follows: in the first section I expose the problems +related to using device specific memory allocators. In the second section, I +expose the hardware limitations that are inherent to many platforms. The third +section gives an overview of the HMM design. The fourth section explains how +CPU page-table mirroring works and the purpose of HMM in this context. The +fifth section deals with how device memory is represented inside the kernel. +Finally, the last section presents a new migration helper that allows +leveraging the device DMA engine. + +.. contents:: :local: + +Problems of using a device specific memory allocator +==================================================== + +Devices with a large amount of on board memory (several gigabytes) like GPUs +have historically managed their memory through dedicated driver specific APIs. +This creates a disconnect between memory allocated and managed by a device +driver and regular application memory (private anonymous, shared memory, or +regular file backed memory). From here on I will refer to this aspect as split +address space. I use shared address space to refer to the opposite situation: +i.e., one in which any application memory region can be used by a device +transparently. + +Split address space happens because devices can only access memory allocated +through a device specific API. This implies that all memory objects in a program +are not equal from the device point of view which complicates large programs +that rely on a wide set of libraries. + +Concretely, this means that code that wants to leverage devices like GPUs needs +to copy objects between generically allocated memory (malloc, mmap private, mmap +share) and memory allocated through the device driver API (this still ends up +with an mmap but of the device file). + +For flat data sets (array, grid, image, ...) this isn't too hard to achieve but +for complex data sets (list, tree, ...) it's hard to get right. Duplicating a +complex data set needs to re-map all the pointer relations between each of its +elements. This is error prone and programs get harder to debug because of the +duplicate data set and addresses. + +Split address space also means that libraries cannot transparently use data +they are getting from the core program or another library and thus each library +might have to duplicate its input data set using the device specific memory +allocator. Large projects suffer from this and waste resources because of the +various memory copies. + +Duplicating each library API to accept as input or output memory allocated by +each device specific allocator is not a viable option. It would lead to a +combinatorial explosion in the library entry points. + +Finally, with the advance of high level language constructs (in C++ but in +other languages too) it is now possible for the compiler to leverage GPUs and +other devices without programmer knowledge. Some compiler identified patterns +are only do-able with a shared address space. It is also more reasonable to use +a shared address space for all other patterns. + + +I/O bus, device memory characteristics +====================================== + +I/O buses cripple shared address spaces due to a few limitations. Most I/O +buses only allow basic memory access from device to main memory; even cache +coherency is often optional. Access to device memory from a CPU is even more +limited. More often than not, it is not cache coherent. + +If we only consider the PCIE bus, then a device can access main memory (often +through an IOMMU) and be cache coherent with the CPUs. However, it only allows +a limited set of atomic operations from the device on main memory. This is worse +in the other direction: the CPU can only access a limited range of the device +memory and cannot perform atomic operations on it. Thus device memory cannot +be considered the same as regular memory from the kernel point of view. + +Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 +and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). +The final limitation is latency. Access to main memory from the device has an +order of magnitude higher latency than when the device accesses its own memory. + +Some platforms are developing new I/O buses or additions/modifications to PCIE +to address some of these limitations (OpenCAPI, CCIX). They mainly allow +two-way cache coherency between CPU and device and allow all atomic operations the +architecture supports. Sadly, not all platforms are following this trend and +some major architectures are left without hardware solutions to these problems. + +So for shared address space to make sense, not only must we allow devices to +access any memory but we must also permit any memory to be migrated to device +memory while the device is using it (blocking CPU access while it happens). + + +Shared address space and migration +================================== + +HMM intends to provide two main features. The first one is to share the address +space by duplicating the CPU page table in the device page table so the same +address points to the same physical memory for any valid main memory address in +the process address space. + +To achieve this, HMM offers a set of helpers to populate the device page table +while keeping track of CPU page table updates. Device page table updates are +not as easy as CPU page table updates. To update the device page table, you must +allocate a buffer (or use a pool of pre-allocated buffers) and write GPU +specific commands in it to perform the update (unmap, cache invalidations, and +flush, ...). This cannot be done through common code for all devices. Hence +why HMM provides helpers to factor out everything that can be while leaving the +hardware specific details to the device driver. + +The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that +allows allocating a struct page for each page of device memory. Those pages +are special because the CPU cannot map them. However, they allow migrating +main memory to device memory using existing migration mechanisms and everything +looks like a page that is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm +mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +memory for the device memory and second to perform migration. Policy decisions +of what and when to migrate is left to the device driver. + +Note that any CPU access to a device page triggers a page fault and a migration +back to main memory. For example, when a page backing a given CPU address A is +migrated from a main memory page to a device page, then any CPU access to +address A triggers a page fault and initiates a migration back to main memory. + +With these two features, HMM not only allows a device to mirror process address +space and keeps both CPU and device page tables synchronized, but also +leverages device memory by migrating the part of the data set that is actively being +used by the device. + + +Address space mirroring implementation and API +============================================== + +Address space mirroring's main objective is to allow duplication of a range of +CPU page table into a device page table; HMM helps keep both synchronized. A +device driver that wants to mirror a process address space must start with the +registration of a mmu_interval_notifier:: + + int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops); + +During the ops->invalidate() callback the device driver must perform the +update action to the range (mark range read only, or fully unmap, etc.). The +device must complete the update before the driver callback returns. + +When the device driver wants to populate a range of virtual addresses, it can +use:: + + int hmm_range_fault(struct hmm_range *range); + +It will trigger a page fault on missing or read-only entries if write access is +requested (see below). Page faults use the generic mm page fault code path just +like a CPU page fault. + +Both functions copy CPU page table entries into their pfns array argument. Each +entry in that array corresponds to an address in the virtual range. HMM +provides a set of flags to help the driver identify special CPU page table +entries. + +Locking within the sync_cpu_device_pagetables() callback is the most important +aspect the driver must respect in order to keep things properly synchronized. +The usage pattern is:: + + int driver_populate_range(...) + { + struct hmm_range range; + ... + + range.notifier = &interval_sub; + range.start = ...; + range.end = ...; + range.hmm_pfns = ...; + + if (!mmget_not_zero(interval_sub->notifier.mm)) + return -EFAULT; + + again: + range.notifier_seq = mmu_interval_read_begin(&interval_sub); + mmap_read_lock(mm); + ret = hmm_range_fault(&range); + if (ret) { + mmap_read_unlock(mm); + if (ret == -EBUSY) + goto again; + return ret; + } + mmap_read_unlock(mm); + + take_lock(driver->update); + if (mmu_interval_read_retry(&ni, range.notifier_seq) { + release_lock(driver->update); + goto again; + } + + /* Use pfns array content to update device page table, + * under the update lock */ + + release_lock(driver->update); + return 0; + } + +The driver->update lock is the same lock that the driver takes inside its +invalidate() callback. That lock must be held before calling +mmu_interval_read_retry() to avoid any race with a concurrent CPU page table +update. + +Leverage default_flags and pfn_flags_mask +========================================= + +The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify +fault or snapshot policy for the whole range instead of having to set them +for each entry in the pfns array. + +For instance if the device driver wants pages for a range with at least read +permission, it sets:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = 0; + +and calls hmm_range_fault() as described above. This will fill fault all pages +in the range with at least read permission. + +Now let's say the driver wants to do the same except for one page in the range for +which it wants to have write permission. Now driver set:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = HMM_PFN_REQ_WRITE; + range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; + +With this, HMM will fault in all pages with at least read (i.e., valid) and for the +address == range->start + (index_of_write << PAGE_SHIFT) it will fault with +write permission i.e., if the CPU pte does not have write permission set then HMM +will call handle_mm_fault(). + +After hmm_range_fault completes the flag bits are set to the current state of +the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is +writable. + + +Represent and manage device memory from core kernel point of view +================================================================= + +Several different designs were tried to support device memory. The first one +used a device specific data structure to keep information about migrated memory +and HMM hooked itself in various places of mm code to handle any access to +addresses that were backed by device memory. It turns out that this ended up +replicating most of the fields of struct page and also needed many kernel code +paths to be updated to understand this new kind of memory. + +Most kernel code paths never try to access the memory behind a page +but only care about struct page contents. Because of this, HMM switched to +directly using struct page for device memory which left most kernel code paths +unaware of the difference. We only need to make sure that no one ever tries to +map those pages from the CPU side. + +Migration to and from device memory +=================================== + +Because the CPU cannot access device memory directly, the device driver must +use hardware DMA or device specific load/store instructions to migrate data. +The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() +functions are designed to make drivers easier to write and to centralize common +code across drivers. + +Before migrating pages to device private memory, special device private +``struct page`` need to be created. These will be used as special "swap" +page table entries so that a CPU process will fault if it tries to access +a page that has been migrated to device private memory. + +These can be allocated and freed with:: + + struct resource *res; + struct dev_pagemap pagemap; + + res = request_free_mem_region(&iomem_resource, /* number of bytes */, + "name of driver resource"); + pagemap.type = MEMORY_DEVICE_PRIVATE; + pagemap.range.start = res->start; + pagemap.range.end = res->end; + pagemap.nr_range = 1; + pagemap.ops = &device_devmem_ops; + memremap_pages(&pagemap, numa_node_id()); + + memunmap_pages(&pagemap); + release_mem_region(pagemap.range.start, range_len(&pagemap.range)); + +There are also devm_request_free_mem_region(), devm_memremap_pages(), +devm_memunmap_pages(), and devm_release_mem_region() when the resources can +be tied to a ``struct device``. + +The overall migration steps are similar to migrating NUMA pages within system +memory (see :ref:`Page migration `) but the steps are split +between device driver specific code and shared common code: + +1. ``mmap_read_lock()`` + + The device driver has to pass a ``struct vm_area_struct`` to + migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to + be held for the duration of the migration. + +2. ``migrate_vma_setup(struct migrate_vma *args)`` + + The device driver initializes the ``struct migrate_vma`` fields and passes + the pointer to migrate_vma_setup(). The ``args->flags`` field is used to + filter which source pages should be migrated. For example, setting + ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and + ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in + device private memory. If the latter flag is set, the ``args->pgmap_owner`` + field is used to identify device private pages owned by the driver. This + avoids trying to migrate device private pages residing in other devices. + Currently only anonymous private VMA ranges can be migrated to or from + system memory and device private memory. + + One of the first steps migrate_vma_setup() does is to invalidate other + device's MMUs with the ``mmu_notifier_invalidate_range_start(()`` and + ``mmu_notifier_invalidate_range_end()`` calls around the page table + walks to fill in the ``args->src`` array with PFNs to be migrated. + The ``invalidate_range_start()`` callback is passed a + ``struct mmu_notifier_range`` with the ``event`` field set to + ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to + the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is + allows the device driver to skip the invalidation callback and only + invalidate device private MMU mappings that are actually migrating. + This is explained more in the next section. + + While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()`` + entry results in a valid "zero" PFN stored in the ``args->src`` array. + This lets the driver allocate device private memory and clear it instead + of copying a page of zeros. Valid PTE entries to system memory or + device private struct pages will be locked with ``lock_page()``, isolated + from the LRU (if system memory since device private pages are not on + the LRU), unmapped from the process, and a special migration PTE is + inserted in place of the original PTE. + migrate_vma_setup() also clears the ``args->dst`` array. + +3. The device driver allocates destination pages and copies source pages to + destination pages. + + The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE`` + bit is set and skips entries that are not migrating. The device driver + can also choose to skip migrating a page by not filling in the ``dst`` + array for that page. + + The driver then allocates either a device private struct page or a + system memory page, locks the page with ``lock_page()``, and fills in the + ``dst`` array entry with:: + + dst[i] = migrate_pfn(page_to_pfn(dpage)); + + Now that the driver knows that this page is being migrated, it can + invalidate device private MMU mappings and copy device private memory + to system memory or another device private page. The core Linux kernel + handles CPU page table invalidations so the device driver only has to + invalidate its own MMU mappings. + + The driver can use ``migrate_pfn_to_page(src[i])`` to get the + ``struct page`` of the source and either copy the source page to the + destination or clear the destination device private memory if the pointer + is ``NULL`` meaning the source page was not populated in system memory. + +4. ``migrate_vma_pages()`` + + This step is where the migration is actually "committed". + + If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this + is where the newly allocated page is inserted into the CPU's page table. + This can fail if a CPU thread faults on the same page. However, the page + table is locked and only one of the new pages will be inserted. + The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared + if it loses the race. + + If the source page was locked, isolated, etc. the source ``struct page`` + information is now copied to destination ``struct page`` finalizing the + migration on the CPU side. + +5. Device driver updates device MMU page tables for pages still migrating, + rolling back pages not migrating. + + If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device + driver can update the device MMU and set the write enable bit if the + ``MIGRATE_PFN_WRITE`` bit is set. + +6. ``migrate_vma_finalize()`` + + This step replaces the special migration page table entry with the new + page's page table entry and releases the reference to the source and + destination ``struct page``. + +7. ``mmap_read_unlock()`` + + The lock can now be released. + +Exclusive access memory +======================= + +Some devices have features such as atomic PTE bits that can be used to implement +atomic access to system memory. To support atomic operations to a shared virtual +memory page such a device needs access to that page which is exclusive of any +userspace access from the CPU. The ``make_device_exclusive_range()`` function +can be used to make a memory range inaccessible from userspace. + +This replaces all mappings for pages in the given range with special swap +entries. Any attempt to access the swap entry results in a fault which is +resovled by replacing the entry with the original mapping. A driver gets +notified that the mapping has been changed by MMU notifiers, after which point +it will no longer have exclusive access to the page. Exclusive access is +guranteed to last until the driver drops the page lock and page reference, at +which point any CPU faults on the page may proceed as described. + +Memory cgroup (memcg) and rss accounting +======================================== + +For now, device memory is accounted as any regular page in rss counters (either +anonymous if device page is used for anonymous, file if device page is used for +file backed page, or shmem if device page is used for shared memory). This is a +deliberate choice to keep existing applications, that might start using device +memory without knowing about it, running unimpacted. + +A drawback is that the OOM killer might kill an application using a lot of +device memory and not a lot of regular system memory and thus not freeing much +system memory. We want to gather more real world experience on how applications +and system react under memory pressure in the presence of device memory before +deciding to account device memory differently. + + +Same decision was made for memory cgroup. Device memory pages are accounted +against same memory cgroup a regular page would be accounted to. This does +simplify migration to and from device memory. This also means that migration +back from device memory to regular memory cannot fail because it would +go above memory cgroup limit. We might revisit this choice latter on once we +get more experience in how device memory is used and its impact on memory +resource control. + + +Note that device memory can never be pinned by a device driver nor through GUP +and thus such memory is always free upon process exit. Or when last reference +is dropped in case of shared memory or file backed memory. diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst new file mode 100644 index 000000000000..f143954e0d05 --- /dev/null +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -0,0 +1,596 @@ +.. _hugetlbfs_reserve: + +===================== +Hugetlbfs Reservation +===================== + +Overview +======== + +Huge pages as described at :ref:`hugetlbpage` are typically +preallocated for application use. These huge pages are instantiated in a +task's address space at page fault time if the VMA indicates huge pages are +to be used. If no huge page exists at page fault time, the task is sent +a SIGBUS and often dies an unhappy death. Shortly after huge page support +was added, it was determined that it would be better to detect a shortage +of huge pages at mmap() time. The idea is that if there were not enough +huge pages to cover the mapping, the mmap() would fail. This was first +done with a simple check in the code at mmap() time to determine if there +were enough free huge pages to cover the mapping. Like most things in the +kernel, the code has evolved over time. However, the basic idea was to +'reserve' huge pages at mmap() time to ensure that huge pages would be +available for page faults in that mapping. The description below attempts to +describe how huge page reserve processing is done in the v4.10 kernel. + + +Audience +======== +This description is primarily targeted at kernel developers who are modifying +hugetlbfs code. + + +The Data Structures +=================== + +resv_huge_pages + This is a global (per-hstate) count of reserved huge pages. Reserved + huge pages are only available to the task which reserved them. + Therefore, the number of huge pages generally available is computed + as (``free_huge_pages - resv_huge_pages``). +Reserve Map + A reserve map is described by the structure:: + + struct resv_map { + struct kref refs; + spinlock_t lock; + struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; + }; + + There is one reserve map for each huge page mapping in the system. + The regions list within the resv_map describes the regions within + the mapping. A region is described as:: + + struct file_region { + struct list_head link; + long from; + long to; + }; + + The 'from' and 'to' fields of the file region structure are huge page + indices into the mapping. Depending on the type of mapping, a + region in the reserv_map may indicate reservations exist for the + range, or reservations do not exist. +Flags for MAP_PRIVATE Reservations + These are stored in the bottom bits of the reservation map pointer. + + ``#define HPAGE_RESV_OWNER (1UL << 0)`` + Indicates this task is the owner of the reservations + associated with the mapping. + ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` + Indicates task originally mapping this range (and creating + reserves) has unmapped a page from this task (the child) + due to a failed COW. +Page Flags + The PagePrivate page flag is used to indicate that a huge page + reservation must be restored when the huge page is freed. More + details will be discussed in the "Freeing huge pages" section. + + +Reservation Map Location (Private or Shared) +============================================ + +A huge page mapping or segment is either private or shared. If private, +it is typically only available to a single address space (task). If shared, +it can be mapped into multiple address spaces (tasks). The location and +semantics of the reservation map is significantly different for the two types +of mappings. Location differences are: + +- For private mappings, the reservation map hangs off the VMA structure. + Specifically, vma->vm_private_data. This reserve map is created at the + time the mapping (mmap(MAP_PRIVATE)) is created. +- For shared mappings, the reservation map hangs off the inode. Specifically, + inode->i_mapping->private_data. Since shared mappings are always backed + by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode + contains a reservation map. As a result, the reservation map is allocated + when the inode is created. + + +Creating Reservations +===================== +Reservations are created when a huge page backed shared memory segment is +created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB). +These operations result in a call to the routine hugetlb_reserve_pages():: + + int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) + +The first thing hugetlb_reserve_pages() does is check if the NORESERVE +flag was specified in either the shmget() or mmap() call. If NORESERVE +was specified, then this routine returns immediately as no reservations +are desired. + +The arguments 'from' and 'to' are huge page indices into the mapping or +underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to +the length of the segment/mapping. For mmap(), the offset argument could +be used to specify the offset into the underlying file. In such a case, +the 'from' and 'to' arguments have been adjusted by this offset. + +One of the big differences between PRIVATE and SHARED mappings is the way +in which reservations are represented in the reservation map. + +- For shared mappings, an entry in the reservation map indicates a reservation + exists or did exist for the corresponding page. As reservations are + consumed, the reservation map is not modified. +- For private mappings, the lack of an entry in the reservation map indicates + a reservation exists for the corresponding page. As reservations are + consumed, entries are added to the reservation map. Therefore, the + reservation map can also be used to determine which reservations have + been consumed. + +For private mappings, hugetlb_reserve_pages() creates the reservation map and +hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set +to indicate this VMA owns the reservations. + +The reservation map is consulted to determine how many huge page reservations +are needed for the current mapping/segment. For private mappings, this is +always the value (to - from). However, for shared mappings it is possible that +some reservations may already exist within the range (to - from). See the +section :ref:`Reservation Map Modifications ` +for details on how this is accomplished. + +The mapping may be associated with a subpool. If so, the subpool is consulted +to ensure there is sufficient space for the mapping. It is possible that the +subpool has set aside reservations that can be used for the mapping. See the +section :ref:`Subpool Reservations ` for more details. + +After consulting the reservation map and subpool, the number of needed new +reservations is known. The routine hugetlb_acct_memory() is called to check +for and take the requested number of reservations. hugetlb_acct_memory() +calls into routines that potentially allocate and adjust surplus page counts. +However, within those routines the code is simply checking to ensure there +are enough free huge pages to accommodate the reservation. If there are, +the global reservation count resv_huge_pages is adjusted something like the +following:: + + if (resv_needed <= (resv_huge_pages - free_huge_pages)) + resv_huge_pages += resv_needed; + +Note that the global lock hugetlb_lock is held when checking and adjusting +these counters. + +If there were enough free huge pages and the global count resv_huge_pages +was adjusted, then the reservation map associated with the mapping is +modified to reflect the reservations. In the case of a shared mapping, a +file_region will exist that includes the range 'from' - 'to'. For private +mappings, no modifications are made to the reservation map as lack of an +entry indicates a reservation exists. + +If hugetlb_reserve_pages() was successful, the global reservation count and +reservation map associated with the mapping will be modified as required to +ensure reservations exist for the range 'from' - 'to'. + +.. _consume_resv: + +Consuming Reservations/Allocating a Huge Page +============================================= + +Reservations are consumed when huge pages associated with the reservations +are allocated and instantiated in the corresponding mapping. The allocation +is performed within the routine alloc_huge_page():: + + struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) + +alloc_huge_page is passed a VMA pointer and a virtual address, so it can +consult the reservation map to determine if a reservation exists. In addition, +alloc_huge_page takes the argument avoid_reserve which indicates reserves +should not be used even if it appears they have been set aside for the +specified address. The avoid_reserve argument is most often used in the case +of Copy on Write and Page Migration where additional copies of an existing +page are being allocated. + +The helper routine vma_needs_reservation() is called to determine if a +reservation exists for the address within the mapping(vma). See the section +:ref:`Reservation Map Helper Routines ` for detailed +information on what this routine does. +The value returned from vma_needs_reservation() is generally +0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists. +If a reservation does not exist, and there is a subpool associated with the +mapping the subpool is consulted to determine if it contains reservations. +If the subpool contains reservations, one can be used for this allocation. +However, in every case the avoid_reserve argument overrides the use of +a reservation for the allocation. After determining whether a reservation +exists and can be used for the allocation, the routine dequeue_huge_page_vma() +is called. This routine takes two arguments related to reservations: + +- avoid_reserve, this is the same value/argument passed to alloc_huge_page() +- chg, even though this argument is of type long only the values 0 or 1 are + passed to dequeue_huge_page_vma. If the value is 0, it indicates a + reservation exists (see the section "Memory Policy and Reservations" for + possible issues). If the value is 1, it indicates a reservation does not + exist and the page must be taken from the global free pool if possible. + +The free lists associated with the memory policy of the VMA are searched for +a free page. If a page is found, the value free_huge_pages is decremented +when the page is removed from the free list. If there was a reservation +associated with the page, the following adjustments are made:: + + SetPagePrivate(page); /* Indicates allocating this page consumed + * a reservation, and if an error is + * encountered such that the page must be + * freed, the reservation will be restored. */ + resv_huge_pages--; /* Decrement the global reservation count */ + +Note, if no huge page can be found that satisfies the VMA's memory policy +an attempt will be made to allocate one using the buddy allocator. This +brings up the issue of surplus huge pages and overcommit which is beyond +the scope reservations. Even if a surplus page is allocated, the same +reservation based adjustments as above will be made: SetPagePrivate(page) and +resv_huge_pages--. + +After obtaining a new huge page, (page)->private is set to the value of +the subpool associated with the page if it exists. This will be used for +subpool accounting when the page is freed. + +The routine vma_commit_reservation() is then called to adjust the reserve +map based on the consumption of the reservation. In general, this involves +ensuring the page is represented within a file_region structure of the region +map. For shared mappings where the reservation was present, an entry +in the reserve map already existed so no change is made. However, if there +was no reservation in a shared mapping or this was a private mapping a new +entry must be created. + +It is possible that the reserve map could have been changed between the call +to vma_needs_reservation() at the beginning of alloc_huge_page() and the +call to vma_commit_reservation() after the page was allocated. This would +be possible if hugetlb_reserve_pages was called for the same page in a shared +mapping. In such cases, the reservation count and subpool free page count +will be off by one. This rare condition can be identified by comparing the +return value from vma_needs_reservation and vma_commit_reservation. If such +a race is detected, the subpool and global reserve counts are adjusted to +compensate. See the section +:ref:`Reservation Map Helper Routines ` for more +information on these routines. + + +Instantiate Huge Pages +====================== + +After huge page allocation, the page is typically added to the page tables +of the allocating task. Before this, pages in a shared mapping are added +to the page cache and pages in private mappings are added to an anonymous +reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore, +when a huge page that has been instantiated is freed no adjustment is made +to the global reservation count (resv_huge_pages). + + +Freeing Huge Pages +================== + +Huge page freeing is performed by the routine free_huge_page(). This routine +is the destructor for hugetlbfs compound pages. As a result, it is only +passed a pointer to the page struct. When a huge page is freed, reservation +accounting may need to be performed. This would be the case if the page was +associated with a subpool that contained reserves, or the page is being freed +on an error path where a global reserve count must be restored. + +The page->private field points to any subpool associated with the page. +If the PagePrivate flag is set, it indicates the global reserve count should +be adjusted (see the section +:ref:`Consuming Reservations/Allocating a Huge Page ` +for information on how these are set). + +The routine first calls hugepage_subpool_put_pages() for the page. If this +routine returns a value of 0 (which does not equal the value passed 1) it +indicates reserves are associated with the subpool, and this newly free page +must be used to keep the number of subpool reserves above the minimum size. +Therefore, the global resv_huge_pages counter is incremented in this case. + +If the PagePrivate flag was set in the page, the global resv_huge_pages counter +will always be incremented. + +.. _sub_pool_resv: + +Subpool Reservations +==================== + +There is a struct hstate associated with each huge page size. The hstate +tracks all huge pages of the specified size. A subpool represents a subset +of pages within a hstate that is associated with a mounted hugetlbfs +filesystem. + +When a hugetlbfs filesystem is mounted a min_size option can be specified +which indicates the minimum number of huge pages required by the filesystem. +If this option is specified, the number of huge pages corresponding to +min_size are reserved for use by the filesystem. This number is tracked in +the min_hpages field of a struct hugepage_subpool. At mount time, +hugetlb_acct_memory(min_hpages) is called to reserve the specified number of +huge pages. If they can not be reserved, the mount fails. + +The routines hugepage_subpool_get/put_pages() are called when pages are +obtained from or released back to a subpool. They perform all subpool +accounting, and track any reservations associated with the subpool. +hugepage_subpool_get/put_pages are passed the number of huge pages by which +to adjust the subpool 'used page' count (down for get, up for put). Normally, +they return the same value that was passed or an error if not enough pages +exist in the subpool. + +However, if reserves are associated with the subpool a return value less +than the passed value may be returned. This return value indicates the +number of additional global pool adjustments which must be made. For example, +suppose a subpool contains 3 reserved huge pages and someone asks for 5. +The 3 reserved pages associated with the subpool can be used to satisfy part +of the request. But, 2 pages must be obtained from the global pools. To +relay this information to the caller, the value 2 is returned. The caller +is then responsible for attempting to obtain the additional two pages from +the global pools. + + +COW and Reservations +==================== + +Since shared mappings all point to and use the same underlying pages, the +biggest reservation concern for COW is private mappings. In this case, +two tasks can be pointing at the same previously allocated page. One task +attempts to write to the page, so a new page must be allocated so that each +task points to its own page. + +When the page was originally allocated, the reservation for that page was +consumed. When an attempt to allocate a new page is made as a result of +COW, it is possible that no free huge pages are free and the allocation +will fail. + +When the private mapping was originally created, the owner of the mapping +was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation +map of the owner. Since the owner created the mapping, the owner owns all +the reservations associated with the mapping. Therefore, when a write fault +occurs and there is no page available, different action is taken for the owner +and non-owner of the reservation. + +In the case where the faulting task is not the owner, the fault will fail and +the task will typically receive a SIGBUS. + +If the owner is the faulting task, we want it to succeed since it owned the +original reservation. To accomplish this, the page is unmapped from the +non-owning task. In this way, the only reference is from the owning task. +In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer +of the non-owning task. The non-owning task may receive a SIGBUS if it later +faults on a non-present page. But, the original owner of the +mapping/reservation will behave as expected. + + +.. _resv_map_modifications: + +Reservation Map Modifications +============================= + +The following low level routines are used to make modifications to a +reservation map. Typically, these routines are not called directly. Rather, +a reservation map helper routine is called which calls one of these low level +routines. These low level routines are fairly well documented in the source +code (mm/hugetlb.c). These routines are:: + + long region_chg(struct resv_map *resv, long f, long t); + long region_add(struct resv_map *resv, long f, long t); + void region_abort(struct resv_map *resv, long f, long t); + long region_count(struct resv_map *resv, long f, long t); + +Operations on the reservation map typically involve two operations: + +1) region_chg() is called to examine the reserve map and determine how + many pages in the specified range [f, t) are NOT currently represented. + + The calling code performs global checks and allocations to determine if + there are enough huge pages for the operation to succeed. + +2) + a) If the operation can succeed, region_add() is called to actually modify + the reservation map for the same range [f, t) previously passed to + region_chg(). + b) If the operation can not succeed, region_abort is called for the same + range [f, t) to abort the operation. + +Note that this is a two step process where region_add() and region_abort() +are guaranteed to succeed after a prior call to region_chg() for the same +range. region_chg() is responsible for pre-allocating any data structures +necessary to ensure the subsequent operations (specifically region_add())) +will succeed. + +As mentioned above, region_chg() determines the number of pages in the range +which are NOT currently represented in the map. This number is returned to +the caller. region_add() returns the number of pages in the range added to +the map. In most cases, the return value of region_add() is the same as the +return value of region_chg(). However, in the case of shared mappings it is +possible for changes to the reservation map to be made between the calls to +region_chg() and region_add(). In this case, the return value of region_add() +will not match the return value of region_chg(). It is likely that in such +cases global counts and subpool accounting will be incorrect and in need of +adjustment. It is the responsibility of the caller to check for this condition +and make the appropriate adjustments. + +The routine region_del() is called to remove regions from a reservation map. +It is typically called in the following situations: + +- When a file in the hugetlbfs filesystem is being removed, the inode will + be released and the reservation map freed. Before freeing the reservation + map, all the individual file_region structures must be freed. In this case + region_del is passed the range [0, LONG_MAX). +- When a hugetlbfs file is being truncated. In this case, all allocated pages + after the new file size must be freed. In addition, any file_region entries + in the reservation map past the new end of file must be deleted. In this + case, region_del is passed the range [new_end_of_file, LONG_MAX). +- When a hole is being punched in a hugetlbfs file. In this case, huge pages + are removed from the middle of the file one at a time. As the pages are + removed, region_del() is called to remove the corresponding entry from the + reservation map. In this case, region_del is passed the range + [page_idx, page_idx + 1). + +In every case, region_del() will return the number of pages removed from the +reservation map. In VERY rare cases, region_del() can fail. This can only +happen in the hole punch case where it has to split an existing file_region +entry and can not allocate a new structure. In this error case, region_del() +will return -ENOMEM. The problem here is that the reservation map will +indicate that there is a reservation for the page. However, the subpool and +global reservation counts will not reflect the reservation. To handle this +situation, the routine hugetlb_fix_reserve_counts() is called to adjust the +counters so that they correspond with the reservation map entry that could +not be deleted. + +region_count() is called when unmapping a private huge page mapping. In +private mappings, the lack of a entry in the reservation map indicates that +a reservation exists. Therefore, by counting the number of entries in the +reservation map we know how many reservations were consumed and how many are +outstanding (outstanding = (end - start) - region_count(resv, start, end)). +Since the mapping is going away, the subpool and global reservation counts +are decremented by the number of outstanding reservations. + +.. _resv_map_helpers: + +Reservation Map Helper Routines +=============================== + +Several helper routines exist to query and modify the reservation maps. +These routines are only interested with reservations for a specific huge +page, so they just pass in an address instead of a range. In addition, +they pass in the associated VMA. From the VMA, the type of mapping (private +or shared) and the location of the reservation map (inode or VMA) can be +determined. These routines simply call the underlying routines described +in the section "Reservation Map Modifications". However, they do take into +account the 'opposite' meaning of reservation map entries for private and +shared mappings and hide this detail from the caller:: + + long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This routine calls region_chg() for the specified page. If no reservation +exists, 1 is returned. If a reservation exists, 0 is returned:: + + long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This calls region_add() for the specified page. As in the case of region_chg +and region_add, this routine is to be called after a previous call to +vma_needs_reservation. It will add a reservation entry for the page. It +returns 1 if the reservation was added and 0 if not. The return value should +be compared with the return value of the previous call to +vma_needs_reservation. An unexpected difference indicates the reservation +map was modified between calls:: + + void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This calls region_abort() for the specified page. As in the case of region_chg +and region_abort, this routine is to be called after a previous call to +vma_needs_reservation. It will abort/end the in progress reservation add +operation:: + + long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This is a special wrapper routine to help facilitate reservation cleanup +on error paths. It is only called from the routine restore_reserve_on_error(). +This routine is used in conjunction with vma_needs_reservation in an attempt +to add a reservation to the reservation map. It takes into account the +different reservation map semantics for private and shared mappings. Hence, +region_add is called for shared mappings (as an entry present in the map +indicates a reservation), and region_del is called for private mappings (as +the absence of an entry in the map indicates a reservation). See the section +"Reservation cleanup in error paths" for more information on what needs to +be done on error paths. + + +Reservation Cleanup in Error Paths +================================== + +As mentioned in the section +:ref:`Reservation Map Helper Routines `, reservation +map modifications are performed in two steps. First vma_needs_reservation +is called before a page is allocated. If the allocation is successful, +then vma_commit_reservation is called. If not, vma_end_reservation is called. +Global and subpool reservation counts are adjusted based on success or failure +of the operation and all is well. + +Additionally, after a huge page is instantiated the PagePrivate flag is +cleared so that accounting when the page is ultimately freed is correct. + +However, there are several instances where errors are encountered after a huge +page is allocated but before it is instantiated. In this case, the page +allocation has consumed the reservation and made the appropriate subpool, +reservation map and global count adjustments. If the page is freed at this +time (before instantiation and clearing of PagePrivate), then free_huge_page +will increment the global reservation count. However, the reservation map +indicates the reservation was consumed. This resulting inconsistent state +will cause the 'leak' of a reserved huge page. The global reserve count will +be higher than it should and prevent allocation of a pre-allocated page. + +The routine restore_reserve_on_error() attempts to handle this situation. It +is fairly well documented. The intention of this routine is to restore +the reservation map to the way it was before the page allocation. In this +way, the state of the reservation map will correspond to the global reservation +count after the page is freed. + +The routine restore_reserve_on_error itself may encounter errors while +attempting to restore the reservation map entry. In this case, it will +simply clear the PagePrivate flag of the page. In this way, the global +reserve count will not be incremented when the page is freed. However, the +reservation map will continue to look as though the reservation was consumed. +A page can still be allocated for the address, but it will not use a reserved +page as originally intended. + +There is some code (most notably userfaultfd) which can not call +restore_reserve_on_error. In this case, it simply modifies the PagePrivate +so that a reservation will not be leaked when the huge page is freed. + + +Reservations and Memory Policy +============================== +Per-node huge page lists existed in struct hstate when git was first used +to manage Linux code. The concept of reservations was added some time later. +When reservations were added, no attempt was made to take memory policy +into account. While cpusets are not exactly the same as memory policy, this +comment in hugetlb_acct_memory sums up the interaction between reservations +and cpusets/memory policy:: + + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + +Huge page reservations were added to prevent unexpected page allocation +failures (OOM) at page fault time. However, if an application makes use +of cpusets or memory policy there is no guarantee that huge pages will be +available on the required nodes. This is true even if there are a sufficient +number of global reservations. + +Hugetlbfs regression testing +============================ + +The most complete set of hugetlb tests are in the libhugetlbfs repository. +If you modify any hugetlb related code, use the libhugetlbfs test suite +to check for regressions. In addition, if you add any new hugetlb +functionality, please add appropriate tests to libhugetlbfs. + +-- +Mike Kravetz, 7 April 2017 diff --git a/Documentation/mm/hwpoison.rst b/Documentation/mm/hwpoison.rst new file mode 100644 index 000000000000..b9d5253c1305 --- /dev/null +++ b/Documentation/mm/hwpoison.rst @@ -0,0 +1,184 @@ +.. hwpoison: + +======== +hwpoison +======== + +What is hwpoison? +================= + +Upcoming Intel CPUs have support for recovering from some memory errors +(``MCA recovery``). This requires the OS to declare a page "poisoned", +kill the processes associated with it and avoid using it in the future. + +This patchkit implements the necessary infrastructure in the VM. + +To quote the overview comment:: + + High level machine check handler. Handles pages reported by the + hardware as being corrupted usually due to a 2bit ECC memory or cache + failure. + + This focusses on pages detected as corrupted in the background. + When the current CPU tries to consume corruption the currently + running process can just be killed directly instead. This implies + that if the error cannot be handled for some reason it's safe to + just ignore it because no corruption has been consumed yet. Instead + when that happens another machine check will happen. + + Handles page cache pages in various states. The tricky part + here is that we can access any page asynchronous to other VM + users, because memory failures could happen anytime and anywhere, + possibly violating some of their assumptions. This is why this code + has to be extremely careful. Generally it tries to use normal locking + rules, as in get the standard locks, even if that means the + error handling takes potentially a long time. + + Some of the operations here are somewhat inefficient and have non + linear algorithmic complexity, because the data structures have not + been optimized for this case. This is in particular the case + for the mapping from a vma to a process. Since this case is expected + to be rare we hope we can get away with this. + +The code consists of a the high level handler in mm/memory-failure.c, +a new page poison bit and various checks in the VM to handle poisoned +pages. + +The main target right now is KVM guests, but it works for all kinds +of applications. KVM support requires a recent qemu-kvm release. + +For the KVM use there was need for a new signal type so that +KVM can inject the machine check into the guest with the proper +address. This in theory allows other applications to handle +memory failures too. The expection is that near all applications +won't do that, but some very specialized ones might. + +Failure recovery modes +====================== + +There are two (actually three) modes memory failure recovery can be in: + +vm.memory_failure_recovery sysctl set to zero: + All memory failures cause a panic. Do not attempt recovery. + +early kill + (can be controlled globally and per process) + Send SIGBUS to the application as soon as the error is detected + This allows applications who can process memory errors in a gentle + way (e.g. drop affected object) + This is the mode used by KVM qemu. + +late kill + Send SIGBUS when the application runs into the corrupted page. + This is best for memory error unaware applications and default + Note some pages are always handled as late kill. + +User control +============ + +vm.memory_failure_recovery + See sysctl.txt + +vm.memory_failure_early_kill + Enable early kill mode globally + +PR_MCE_KILL + Set early/late kill mode/revert to system default + + arg1: PR_MCE_KILL_CLEAR: + Revert to system default + arg1: PR_MCE_KILL_SET: + arg2 defines thread specific mode + + PR_MCE_KILL_EARLY: + Early kill + PR_MCE_KILL_LATE: + Late kill + PR_MCE_KILL_DEFAULT + Use system global default + + Note that if you want to have a dedicated thread which handles + the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should + call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, + the SIGBUS is sent to the main thread. + +PR_MCE_KILL_GET + return current mode + +Testing +======= + +* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the + process for testing + +* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/`` + + corrupt-pfn + Inject hwpoison fault at PFN echoed into this file. This does + some early filtering to avoid corrupted unintended pages in test suites. + + unpoison-pfn + Software-unpoison page at PFN echoed into this file. This way + a page can be reused again. This only works for Linux + injected failures, not for real memory failures. Once any hardware + memory failure happens, this feature is disabled. + + Note these injection interfaces are not stable and might change between + kernel versions + + corrupt-filter-dev-major, corrupt-filter-dev-minor + Only handle memory failures to pages associated with the file + system defined by block device major/minor. -1U is the + wildcard value. This should be only used for testing with + artificial injection. + + corrupt-filter-memcg + Limit injection to pages owned by memgroup. Specified by inode + number of the memcg. + + Example:: + + mkdir /sys/fs/cgroup/mem/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks + + memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages + + corrupt-filter-flags-mask, corrupt-filter-flags-value + When specified, only poison pages if ((page_flags & mask) == + value). This allows stress testing of many kinds of + pages. The page_flags are the same as in /proc/kpageflags. The + flag bits are defined in include/linux/kernel-page-flags.h and + documented in Documentation/admin-guide/mm/pagemap.rst + +* Architecture specific MCE injector + + x86 has mce-inject, mce-test + + Some portable hwpoison test programs in mce-test, see below. + +References +========== + +http://halobates.de/mce-lc09-2.pdf + Overview presentation from LinuxCon 09 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + Test suite (hwpoison specific portable tests in tsrc) + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86 specific injector + + +Limitations +=========== +- Not all page types are supported and never will. Most kernel internal + objects cannot be recovered, only LRU pages for now. + +--- +Andi Kleen, Oct 2009 diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst new file mode 100644 index 000000000000..575ccd40e30c --- /dev/null +++ b/Documentation/mm/index.rst @@ -0,0 +1,68 @@ +===================================== +Linux Memory Management Documentation +===================================== + +Memory Management Guide +======================= + +This is a guide to understanding the memory management subsystem +of Linux. If you are looking for advice on simply allocating memory, +see the :ref:`memory_allocation`. For controlling and tuning guides, +see the :doc:`admin guide <../admin-guide/mm/index>`. + +.. toctree:: + :maxdepth: 1 + + physical_memory + page_tables + process_addrs + bootmem + page_allocation + vmalloc + slab + highmem + page_reclaim + swap + page_cache + shmfs + oom + +Legacy Documentation +==================== + +This is a collection of older documents about the Linux memory management +(MM) subsystem internals with different level of details ranging from +notes and mailing list responses for elaborating descriptions of data +structures and algorithms. It should all be integrated nicely into the +above structured documentation, or deleted if it has served its purpose. + +.. toctree:: + :maxdepth: 1 + + active_mm + arch_pgtable_helpers + balance + damon/index + free_page_reporting + frontswap + hmm + hwpoison + hugetlbfs_reserv + ksm + memory-model + mmu_notifier + numa + overcommit-accounting + page_migration + page_frags + page_owner + page_table_check + remap_file_pages + slub + split_page_table_lock + transhuge + unevictable-lru + vmalloced-kernel-stacks + vmemmap_dedup + z3fold + zsmalloc diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst new file mode 100644 index 000000000000..9e37add068e6 --- /dev/null +++ b/Documentation/mm/ksm.rst @@ -0,0 +1,87 @@ +.. _ksm: + +======================= +Kernel Samepage Merging +======================= + +KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, +added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation, +and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/ + +The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst ` + +Design +====== + +Overview +-------- + +.. kernel-doc:: mm/ksm.c + :DOC: Overview + +Reverse mapping +--------------- +KSM maintains reverse mapping information for KSM pages in the stable +tree. + +If a KSM page is shared between less than ``max_page_sharing`` VMAs, +the node of the stable tree that represents such KSM page points to a +list of struct rmap_item and the ``page->mapping`` of the +KSM page points to the stable tree node. + +When the sharing passes this threshold, KSM adds a second dimension to +the stable tree. The tree node becomes a "chain" that links one or +more "dups". Each "dup" keeps reverse mapping information for a KSM +page with ``page->mapping`` pointing to that "dup". + +Every "chain" and all "dups" linked into a "chain" enforce the +invariant that they represent the same write protected memory content, +even if each "dup" will be pointed by a different KSM page copy of +that content. + +This way the stable tree lookup computational complexity is unaffected +if compared to an unlimited list of reverse mappings. It is still +enforced that there cannot be KSM page content duplicates in the +stable tree itself. + +The deduplication limit enforced by ``max_page_sharing`` is required +to avoid the virtual memory rmap lists to grow too large. The rmap +walk has O(N) complexity where N is the number of rmap_items +(i.e. virtual mappings) that are sharing the page, which is in turn +capped by ``max_page_sharing``. So this effectively spreads the linear +O(N) computational complexity from rmap walk context over different +KSM pages. The ksmd walk over the stable_node "chains" is also O(N), +but N is the number of stable_node "dups", not the number of +rmap_items, so it has not a significant impact on ksmd performance. In +practice the best stable_node "dup" candidate will be kept and found +at the head of the "dups" list. + +High values of ``max_page_sharing`` result in faster memory merging +(because there will be fewer stable_node dups queued into the +stable_node chain->hlist to check for pruning) and higher +deduplication factor at the expense of slower worst case for rmap +walks for any KSM page which can happen during swapping, compaction, +NUMA balancing and page migration. + +The ``stable_node_dups/stable_node_chains`` ratio is also affected by the +``max_page_sharing`` tunable, and an high ratio may indicate fragmentation +in the stable_node dups, which could be solved by introducing +fragmentation algorithms in ksmd which would refile rmap_items from +one stable_node dup to another stable_node dup, in order to free up +stable_node "dups" with few rmap_items in them, but that may increase +the ksmd CPU usage and possibly slowdown the readonly computations on +the KSM pages of the applications. + +The whole list of stable_node "dups" linked in the stable_node +"chains" is scanned periodically in order to prune stale stable_nodes. +The frequency of such scans is defined by +``stable_node_chains_prune_millisecs`` sysfs tunable. + +Reference +--------- +.. kernel-doc:: mm/ksm.c + :functions: mm_slot ksm_scan stable_node rmap_item + +-- +Izik Eidus, +Hugh Dickins, 17 Nov 2009 diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst new file mode 100644 index 000000000000..3779e562dc76 --- /dev/null +++ b/Documentation/mm/memory-model.rst @@ -0,0 +1,177 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _physical_memory_model: + +===================== +Physical Memory Model +===================== + +Physical memory in a system may be addressed in different ways. The +simplest case is when the physical memory starts at address 0 and +spans a contiguous range up to the maximal address. It could be, +however, that this range contains small holes that are not accessible +for the CPU. Then there could be several contiguous ranges at +completely distinct addresses. And, don't forget about NUMA, where +different memory banks are attached to different CPUs. + +Linux abstracts this diversity using one of the two memory models: +FLATMEM and SPARSEMEM. Each architecture defines what +memory models it supports, what the default memory model is and +whether it is possible to manually override that default. + +All the memory models track the status of physical page frames using +struct page arranged in one or more arrays. + +Regardless of the selected memory model, there exists one-to-one +mapping between the physical page frame number (PFN) and the +corresponding `struct page`. + +Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn` +helpers that allow the conversion from PFN to `struct page` and vice +versa. + +FLATMEM +======= + +The simplest memory model is FLATMEM. This model is suitable for +non-NUMA systems with contiguous, or mostly contiguous, physical +memory. + +In the FLATMEM memory model, there is a global `mem_map` array that +maps the entire physical memory. For most architectures, the holes +have entries in the `mem_map` array. The `struct page` objects +corresponding to the holes are never fully initialized. + +To allocate the `mem_map` array, architecture specific setup code should +call :c:func:`free_area_init` function. Yet, the mappings array is not +usable until the call to :c:func:`memblock_free_all` that hands all the +memory to the page allocator. + +An architecture may free parts of the `mem_map` array that do not cover the +actual physical pages. In such case, the architecture specific +:c:func:`pfn_valid` implementation should take the holes in the +`mem_map` into account. + +With FLATMEM, the conversion between a PFN and the `struct page` is +straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the +`mem_map` array. + +The `ARCH_PFN_OFFSET` defines the first page frame number for +systems with physical memory starting at address different from 0. + +SPARSEMEM +========= + +SPARSEMEM is the most versatile memory model available in Linux and it +is the only memory model that supports several advanced features such +as hot-plug and hot-remove of the physical memory, alternative memory +maps for non-volatile memory devices and deferred initialization of +the memory map for larger systems. + +The SPARSEMEM model presents the physical memory as a collection of +sections. A section is represented with struct mem_section +that contains `section_mem_map` that is, logically, a pointer to an +array of struct pages. However, it is stored with some other magic +that aids the sections management. The section size and maximal number +of section is specified using `SECTION_SIZE_BITS` and +`MAX_PHYSMEM_BITS` constants defined by each architecture that +supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a +physical address that an architecture supports, the +`SECTION_SIZE_BITS` is an arbitrary value. + +The maximal number of sections is denoted `NR_MEM_SECTIONS` and +defined as + +.. math:: + + NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} + +The `mem_section` objects are arranged in a two-dimensional array +called `mem_sections`. The size and placement of this array depend +on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of +sections: + +* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections` + array is static and has `NR_MEM_SECTIONS` rows. Each row holds a + single `mem_section` object. +* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections` + array is dynamically allocated. Each row contains PAGE_SIZE worth of + `mem_section` objects and the number of rows is calculated to fit + all the memory sections. + +The architecture setup code should call sparse_init() to +initialize the memory sections and the memory maps. + +With SPARSEMEM there are two possible ways to convert a PFN to the +corresponding `struct page` - a "classic sparse" and "sparse +vmemmap". The selection is made at build time and it is determined by +the value of `CONFIG_SPARSEMEM_VMEMMAP`. + +The classic sparse encodes the section number of a page in page->flags +and uses high bits of a PFN to access the section that maps that page +frame. Inside a section, the PFN is the index to the array of pages. + +The sparse vmemmap uses a virtually mapped memory map to optimize +pfn_to_page and page_to_pfn operations. There is a global `struct +page *vmemmap` pointer that points to a virtually contiguous array of +`struct page` objects. A PFN is an index to that array and the +offset of the `struct page` from `vmemmap` is the PFN of that +page. + +To use vmemmap, an architecture has to reserve a range of virtual +addresses that will map the physical pages containing the memory +map and make sure that `vmemmap` points to that range. In addition, +the architecture should implement :c:func:`vmemmap_populate` method +that will allocate the physical memory and create page tables for the +virtual memory map. If an architecture does not have any special +requirements for the vmemmap mappings, it can use default +:c:func:`vmemmap_populate_basepages` provided by the generic memory +management. + +The virtually mapped memory map allows storing `struct page` objects +for persistent memory devices in pre-allocated storage on those +devices. This storage is represented with struct vmem_altmap +that is eventually passed to vmemmap_populate() through a long chain +of function calls. The vmemmap_populate() implementation may use the +`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to +allocate memory map on the persistent memory device. + +ZONE_DEVICE +=========== +The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer +`struct page` `mem_map` services for device driver identified physical +address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact +that the page objects for these address ranges are never marked online, +and that a reference must be taken against the device, not just the page +to keep the memory pinned for active use. `ZONE_DEVICE`, via +:c:func:`devm_memremap_pages`, performs just enough memory hotplug to +turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and +:c:func:`get_user_pages` service for the given range of pfns. Since the +page reference count never drops below 1 the page is never tracked as +free memory and the page's `struct list_head lru` space is repurposed +for back referencing to the host device / driver that mapped the memory. + +While `SPARSEMEM` presents memory as a collection of sections, +optionally collected into memory blocks, `ZONE_DEVICE` users have a need +for smaller granularity of populating the `mem_map`. Given that +`ZONE_DEVICE` memory is never marked online it is subsequently never +subject to its memory ranges being exposed through the sysfs memory +hotplug api on memory block boundaries. The implementation relies on +this lack of user-api constraint to allow sub-section sized memory +ranges to be specified to :c:func:`arch_add_memory`, the top-half of +memory hotplug. Sub-section support allows for 2MB as the cross-arch +common alignment granularity for :c:func:`devm_memremap_pages`. + +The users of `ZONE_DEVICE` are: + +* pmem: Map platform persistent memory to be used as a direct-I/O target + via DAX mappings. + +* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` + event callbacks to allow a device-driver to coordinate memory management + events related to device-memory, typically GPU memory. See + Documentation/mm/hmm.rst. + +* p2pdma: Create `struct page` objects to allow peer devices in a + PCI/-E topology to coordinate direct-DMA operations between themselves, + i.e. bypass host memory. diff --git a/Documentation/mm/mmu_notifier.rst b/Documentation/mm/mmu_notifier.rst new file mode 100644 index 000000000000..df5d7777fc6b --- /dev/null +++ b/Documentation/mm/mmu_notifier.rst @@ -0,0 +1,99 @@ +.. _mmu_notifier: + +When do you need to notify inside page table lock ? +=================================================== + +When clearing a pte/pmd we are given a choice to notify the event through +(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under +the page table lock. But that notification is not necessary in all cases. + +For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use +thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a +process virtual address space). There is only 2 cases when you need to notify +those secondary TLB while holding page table lock when clearing a pte/pmd: + + A) page backing address is free before mmu_notifier_invalidate_range_end() + B) a page table entry is updated to point to a new page (COW, write fault + on zero page, __replace_page(), ...) + +Case A is obvious you do not want to take the risk for the device to write to +a page that might now be used by some completely different task. + +Case B is more subtle. For correctness it requires the following sequence to +happen: + + - take page table lock + - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) + - set page table entry to point to new page + +If clearing the page table entry is not followed by a notify before setting +the new pte/pmd value then you can break memory model like C11 or C++11 for +the device. + +Consider the following scenario (device use a feature similar to ATS/PASID): + +Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume +they are write protected for COW (other case of B apply too). + +:: + + [Time N] -------------------------------------------------------------------- + CPU-thread-0 {try to write to addrA} + CPU-thread-1 {try to write to addrB} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {read addrA and populate device TLB} + DEV-thread-2 {read addrB and populate device TLB} + [Time N+1] ------------------------------------------------------------------ + CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} + CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+2] ------------------------------------------------------------------ + CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} + CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {write to addrA which is a write to new page} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {} + CPU-thread-3 {write to addrB which is a write to new page} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+4] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+5] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {read addrA from old page} + DEV-thread-2 {read addrB from new page} + +So here because at time N+2 the clear page table entry was not pair with a +notification to invalidate the secondary TLB, the device see the new value for +addrB before seeing the new value for addrA. This break total memory ordering +for the device. + +When changing a pte to write protect or to point to a new write protected page +with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range +call to mmu_notifier_invalidate_range_end() outside the page table lock. This +is true even if the thread doing the page table update is preempted right after +releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/Documentation/mm/numa.rst b/Documentation/mm/numa.rst new file mode 100644 index 000000000000..99fdeca917ca --- /dev/null +++ b/Documentation/mm/numa.rst @@ -0,0 +1,150 @@ +.. _numa: + +Started Nov 1999 by Kanoj Sarcar + +============= +What is NUMA? +============= + +This question can be answered from a couple of perspectives: the +hardware view and the Linux software view. + +From the hardware perspective, a NUMA system is a computer platform that +comprises multiple components or assemblies each of which may contain 0 +or more CPUs, local memory, and/or IO buses. For brevity and to +disambiguate the hardware view of these physical components/assemblies +from the software abstraction thereof, we'll call the components/assemblies +'cells' in this document. + +Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset +of the system--although some components necessary for a stand-alone SMP system +may not be populated on any given cell. The cells of the NUMA system are +connected together with some sort of system interconnect--e.g., a crossbar or +point-to-point link are common types of NUMA system interconnects. Both of +these types of interconnects can be aggregated to create NUMA platforms with +cells at multiple distances from other cells. + +For Linux, the NUMA platforms of interest are primarily what is known as Cache +Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible +to and accessible from any CPU attached to any cell and cache coherency +is handled in hardware by the processor caches and/or the system interconnect. + +Memory access time and effective memory bandwidth varies depending on how far +away the cell containing the CPU or IO bus making the memory access is from the +cell containing the target memory. For example, access to memory by CPUs +attached to the same cell will experience faster access times and higher +bandwidths than accesses to memory on other, remote cells. NUMA platforms +can have cells at multiple remote distances from any given cell. + +Platform vendors don't build NUMA systems just to make software developers' +lives interesting. Rather, this architecture is a means to provide scalable +memory bandwidth. However, to achieve scalable memory bandwidth, system and +application software must arrange for a large majority of the memory references +[cache misses] to be to "local" memory--memory on the same cell, if any--or +to the closest cell with memory. + +This leads to the Linux software view of a NUMA system: + +Linux divides the system's hardware resources into multiple software +abstractions called "nodes". Linux maps the nodes onto the physical cells +of the hardware platform, abstracting away some of the details for some +architectures. As with physical cells, software nodes may contain 0 or more +CPUs, memory and/or IO buses. And, again, memory accesses to memory on +"closer" nodes--nodes that map to closer cells--will generally experience +faster access times and higher effective bandwidth than accesses to more +remote cells. + +For some architectures, such as x86, Linux will "hide" any node representing a +physical cell that has no memory attached, and reassign any CPUs attached to +that cell to a node representing a cell that does have memory. Thus, on +these architectures, one cannot assume that all CPUs that Linux associates with +a given node will see the same local memory access times and bandwidth. + +In addition, for some architectures, again x86 is an example, Linux supports +the emulation of additional nodes. For NUMA emulation, linux will carve up +the existing nodes--or the system memory for non-NUMA platforms--into multiple +nodes. Each emulated node will manage a fraction of the underlying cells' +physical memory. NUMA emluation is useful for testing NUMA kernel and +application features on non-NUMA platforms, and as a sort of memory resource +management mechanism when used together with cpusets. +[see Documentation/admin-guide/cgroup-v1/cpusets.rst] + +For each node with memory, Linux constructs an independent memory management +subsystem, complete with its own free page lists, in-use page lists, usage +statistics and locks to mediate access. In addition, Linux constructs for +each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], +an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a +selected zone/node cannot satisfy the allocation request. This situation, +when a zone has no available memory to satisfy a request, is called +"overflow" or "fallback". + +Because some nodes contain multiple zones containing different types of +memory, Linux must decide whether to order the zonelists such that allocations +fall back to the same zone type on a different node, or to a different zone +type on the same node. This is an important consideration because some zones, +such as DMA or DMA32, represent relatively scarce resources. Linux chooses +a default Node ordered zonelist. This means it tries to fallback to other zones +from the same node before using remote nodes which are ordered by NUMA distance. + +By default, Linux will attempt to satisfy memory allocation requests from the +node to which the CPU that executes the request is assigned. Specifically, +Linux will attempt to allocate from the first node in the appropriate zonelist +for the node where the request originates. This is called "local allocation." +If the "local" node cannot satisfy the request, the kernel will examine other +nodes' zones in the selected zonelist looking for the first zone in the list +that can satisfy the request. + +Local allocation will tend to keep subsequent access to the allocated memory +"local" to the underlying physical resources and off the system interconnect-- +as long as the task on whose behalf the kernel allocated some memory does not +later migrate away from that memory. The Linux scheduler is aware of the +NUMA topology of the platform--embodied in the "scheduling domains" data +structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler +attempts to minimize task migration to distant scheduling domains. However, +the scheduler does not take a task's NUMA footprint into account directly. +Thus, under sufficient imbalance, tasks can migrate between nodes, remote +from their initial node and kernel data structures. + +System administrators and application designers can restrict a task's migration +to improve NUMA locality using various CPU affinity command line interfaces, +such as taskset(1) and numactl(1), and program interfaces such as +sched_setaffinity(2). Further, one can modify the kernel's default local +allocation behavior using Linux NUMA memory policy. [see +:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. + +System administrators can restrict the CPUs and nodes' memories that a non- +privileged user can specify in the scheduling or NUMA commands and functions +using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] + +On architectures that do not hide memoryless nodes, Linux will include only +zones [nodes] with memory in the zonelists. This means that for a memoryless +node the "local memory node"--the node of the first zone in CPU's node's +zonelist--will not be the node itself. Rather, it will be the node that the +kernel selected as the nearest node with memory when it built the zonelists. +So, default, local allocations will succeed with the kernel supplying the +closest available memory. This is a consequence of the same mechanism that +allows such allocations to fallback to other nearby nodes when a node that +does contain memory overflows. + +Some kernel allocations do not want or cannot tolerate this allocation fallback +behavior. Rather they want to be sure they get memory from the specified node +or get notified that the node has no free memory. This is usually the case when +a subsystem allocates per CPU memory resources, for example. + +A typical model for making such an allocation is to obtain the node id of the +node to which the "current CPU" is attached using one of the kernel's +numa_node_id() or CPU_to_node() functions and then request memory from only +the node id returned. When such an allocation fails, the requesting subsystem +may revert to its own fallback path. The slab kernel memory allocator is an +example of this. Or, the subsystem may choose to disable or not to enable +itself on allocation failure. The kernel profiling subsystem is an example of +this. + +If the architecture supports--does not hide--memoryless nodes, then CPUs +attached to memoryless nodes would always incur the fallback path overhead +or some subsystems would fail to initialize if they attempted to allocated +memory exclusively from a node without memory. To support such +architectures transparently, kernel subsystems can use the numa_mem_id() +or cpu_to_mem() function to locate the "local memory node" for the calling or +specified CPU. Again, this is the same node from which default, local page +allocations will be attempted. diff --git a/Documentation/mm/oom.rst b/Documentation/mm/oom.rst new file mode 100644 index 000000000000..18e9e40c1ec1 --- /dev/null +++ b/Documentation/mm/oom.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Out Of Memory Handling +====================== diff --git a/Documentation/mm/overcommit-accounting.rst b/Documentation/mm/overcommit-accounting.rst new file mode 100644 index 000000000000..1addb0c374a4 --- /dev/null +++ b/Documentation/mm/overcommit-accounting.rst @@ -0,0 +1,88 @@ +.. _overcommit_accounting: + +===================== +Overcommit Accounting +===================== + +The Linux kernel supports the following overcommit handling modes + +0 + Heuristic overcommit handling. Obvious overcommits of address + space are refused. Used for a typical system. It ensures a + seriously wild allocation fails while allowing overcommit to + reduce swap usage. root is allowed to allocate slightly more + memory in this mode. This is the default. + +1 + Always overcommit. Appropriate for some scientific + applications. Classic example is code using sparse arrays and + just relying on the virtual memory consisting almost entirely + of zero pages. + +2 + Don't overcommit. The total address space commit for the + system is not permitted to exceed swap + a configurable amount + (default is 50%) of physical RAM. Depending on the amount you + use, in most situations this means a process will not be + killed while accessing pages but will receive errors on memory + allocation as appropriate. + + Useful for applications that want to guarantee their memory + allocations will be available in the future without having to + initialize every page. + +The overcommit policy is set via the sysctl ``vm.overcommit_memory``. + +The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) +or ``vm.overcommit_kbytes`` (absolute value). These only have an effect +when ``vm.overcommit_memory`` is set to 2. + +The current overcommit limit and amount committed are viewable in +``/proc/meminfo`` as CommitLimit and Committed_AS respectively. + +Gotchas +======= + +The C language stack growth does an implicit mremap. If you want absolute +guarantees and run close to the edge you MUST mmap your stack for the +largest size you think you will need. For typical stack usage this does +not matter much but it's a corner case if you really really care + +In mode 2 the MAP_NORESERVE flag is ignored. + + +How It Works +============ + +The overcommit is based on the following rules + +For a file backed map + | SHARED or READ-only - 0 cost (the file is the map not swap) + | PRIVATE WRITABLE - size of mapping per instance + +For an anonymous or ``/dev/zero`` map + | SHARED - size of mapping + | PRIVATE READ-only - 0 cost (but of little use) + | PRIVATE WRITABLE - size of mapping per instance + +Additional accounting + | Pages made writable copies by mmap + | shmfs memory drawn from the same pool + +Status +====== + +* We account mmap memory mappings +* We account mprotect changes in commit +* We account mremap changes in size +* We account brk +* We account munmap +* We report the commit status in /proc +* Account and check on fork +* Review stack handling/building on exec +* SHMfs accounting +* Implement actual limit enforcement + +To Do +===== +* Account ptrace pages (this is hard) diff --git a/Documentation/mm/page_allocation.rst b/Documentation/mm/page_allocation.rst new file mode 100644 index 000000000000..d9b4495561f1 --- /dev/null +++ b/Documentation/mm/page_allocation.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Page Allocation +=============== diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst new file mode 100644 index 000000000000..75eba7c431b2 --- /dev/null +++ b/Documentation/mm/page_cache.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +Page Cache +========== diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst new file mode 100644 index 000000000000..7d6f9385d129 --- /dev/null +++ b/Documentation/mm/page_frags.rst @@ -0,0 +1,45 @@ +.. _page_frags: + +============== +Page fragments +============== + +A page fragment is an arbitrary-length arbitrary-offset area of memory +which resides within a 0 or higher order compound page. Multiple +fragments within that page are individually refcounted, in the page's +reference counter. + +The page_frag functions, page_frag_alloc and page_frag_free, provide a +simple allocation framework for page fragments. This is used by the +network stack and network device drivers to provide a backing region of +memory for use as either an sk_buff->head, or to be used in the "frags" +portion of skb_shared_info. + +In order to make use of the page fragment APIs a backing page fragment +cache is needed. This provides a central point for the fragment allocation +and tracks allows multiple calls to make use of a cached page. The +advantage to doing this is that multiple calls to get_page can be avoided +which can be expensive at allocation time. However due to the nature of +this caching it is required that any calls to the cache be protected by +either a per-cpu limitation, or a per-cpu limitation and forcing interrupts +to be disabled when executing the fragment allocation. + +The network stack uses two separate caches per CPU to handle fragment +allocation. The netdev_alloc_cache is used by callers making use of the +netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is +used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +main difference between these two calls is the context in which they may be +called. The "netdev" prefixed functions are usable in any context as these +functions will disable interrupts, while the "napi" prefixed functions are +only usable within the softirq context. + +Many network device drivers use a similar methodology for allocating page +fragments, but the page fragments are cached at the ring or descriptor +level. In order to enable these cases it is necessary to provide a generic +way of tearing down a page cache. For this reason __page_frag_cache_drain +was implemented. It allows for freeing multiple references from a single +page via a single call. The advantage to doing this is that it allows for +cleaning up the multiple references that were added to a page in order to +avoid calling get_page per allocation. + +Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst new file mode 100644 index 000000000000..8c5cb8147e55 --- /dev/null +++ b/Documentation/mm/page_migration.rst @@ -0,0 +1,288 @@ +.. _page_migration: + +============== +Page migration +============== + +Page migration allows moving the physical location of pages between +nodes in a NUMA system while the process is running. This means that the +virtual addresses that the process sees do not change. However, the +system rearranges the physical location of those pages. + +Also see :ref:`Heterogeneous Memory Management (HMM) ` +for migrating pages to or from device private memory. + +The main intent of page migration is to reduce the latency of memory accesses +by moving pages near to the processor where the process accessing that memory +is running. + +Page migration allows a process to manually relocate the node on which its +pages are located through the MF_MOVE and MF_MOVE_ALL options while setting +a new memory policy via mbind(). The pages of a process can also be relocated +from another process using the sys_migrate_pages() function call. The +migrate_pages() function call takes two sets of nodes and moves pages of a +process that are located on the from nodes to the destination nodes. +Page migration functions are provided by the numactl package by Andi Kleen +(a version later than 0.9.3 is required. Get it from +https://github.com/numactl/numactl.git). numactl provides libnuma +which provides an interface similar to other NUMA functionality for page +migration. cat ``/proc//numa_maps`` allows an easy review of where the +pages of a process are located. See also the numa_maps documentation in the +proc(5) man page. + +Manual migration is useful if for example the scheduler has relocated +a process to a processor on a distant node. A batch scheduler or an +administrator may detect the situation and move the pages of the process +nearer to the new processor. The kernel itself only provides +manual page migration support. Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +For example, A NUMA profiler may obtain a log showing frequent off-node +accesses and may use the result to move pages to more advantageous +locations. + +Larger installations usually partition the system using cpusets into +sections of nodes. Paul Jackson has equipped cpusets with the ability to +move pages when a task is moved to another cpuset (See +:ref:`CPUSETS `). +Cpusets allow the automation of process locality. If a task is moved to +a new cpuset then also all its pages are moved with it so that the +performance of the process does not sink dramatically. Also the pages +of processes in a cpuset are moved if the allowed memory nodes of a +cpuset are changed. + +Page migration allows the preservation of the relative location of pages +within a group of nodes for all migration techniques which will preserve a +particular memory allocation pattern generated even after migrating a +process. This is necessary in order to preserve the memory latencies. +Processes will run with similar performance after migration. + +Page migration occurs in several steps. First a high level +description for those trying to use migrate_pages() from the kernel +(for userspace usage see the Andi Kleen's numactl package mentioned above) +and then a low level description of how the low level details work. + +In kernel use of migrate_pages() +================================ + +1. Remove pages from the LRU. + + Lists of pages to be migrated are generated by scanning over + pages and moving them into lists. This is done by + calling isolate_lru_page(). + Calling isolate_lru_page() increases the references to the page + so that it cannot vanish while the page migration occurs. + It also prevents the swapper or other scans from encountering + the page. + +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. + +3. The migrate_pages() function is called which attempts + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. + +How migrate_pages() works +========================= + +migrate_pages() does several passes over its list of pages. A page is moved +if all references to a page are removable at the time. The page has +already been removed from the LRU via isolate_lru_page() and the refcount +is increased so that the page cannot be freed while page migration occurs. + +Steps: + +1. Lock the page to be migrated. + +2. Ensure that writeback is complete. + +3. Lock the new page that we want to move to. It is locked so that accesses to + this (not yet up-to-date) page immediately block while the move is in progress. + +4. All the page table references to the page are converted to migration + entries. This decreases the mapcount of a page. If the resulting + mapcount is not zero then we do not migrate the page. All user space + processes that attempt to access the page will now wait on the page lock + or wait for the migration page table entry to be removed. + +5. The i_pages lock is taken. This will cause all processes trying + to access the page via the mapping to block on the spinlock. + +6. The refcount of the page is examined and we back out if references remain. + Otherwise, we know that we are the only one referencing this page. + +7. The radix tree is checked and if it does not contain the pointer to this + page then we back out because someone else modified the radix tree. + +8. The new page is prepped with some settings from the old page so that + accesses to the new page will discover a page with the correct settings. + +9. The radix tree is changed to point to the new page. + +10. The reference count of the old page is dropped because the address space + reference is gone. A reference to the new page is established because + the new page is referenced by the address space. + +11. The i_pages lock is dropped. With that lookups in the mapping + become possible again. Processes will move from spinning on the lock + to sleeping on the locked new page. + +12. The page contents are copied to the new page. + +13. The remaining page flags are copied to the new page. + +14. The old page flags are cleared to indicate that the page does + not provide any information anymore. + +15. Queued up writeback on the new page is triggered. + +16. If migration entries were inserted into the page table, then replace them + with real ptes. Doing so will enable access for user space processes not + already waiting for the page lock. + +17. The page locks are dropped from the old and new page. + Processes waiting on the page lock will redo their page faults + and will reach the new page. + +18. The new page is moved to the LRU and can be scanned by the swapper, + etc. again. + +Non-LRU page migration +====================== + +Although migration originally aimed for reducing the latency of memory accesses +for NUMA, compaction also uses migration to create high-order pages. + +Current problem of the implementation is that it is designed to migrate only +*LRU* pages. However, there are potential non-LRU pages which can be migrated +in drivers, for example, zsmalloc, virtio-balloon pages. + +For virtio-balloon pages, some parts of migration code path have been hooked +up and added virtio-balloon specific functions to intercept migration logics. +It's too specific to a driver so other drivers who want to make their pages +movable would have to add their own specific hooks in the migration path. + +To overcome the problem, VM supports non-LRU page migration which provides +generic functions for non-LRU movable pages without driver specific hooks +in the migration path. + +If a driver wants to make its pages movable, it should define three functions +which are function pointers of struct address_space_operations. + +1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` + + What VM expects from isolate_page() function of driver is to return *true* + if driver isolates the page successfully. On returning true, VM marks the page + as PG_isolated so concurrent isolation in several CPUs skip the page + for isolation. If a driver cannot isolate the page, it should return *false*. + + Once page is successfully isolated, VM uses page.lru fields so driver + shouldn't expect to preserve values in those fields. + +2. ``int (*migratepage) (struct address_space *mapping,`` +| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` + + After isolation, VM calls migratepage() of driver with the isolated page. + The function of migratepage() is to move the contents of the old page to the + new page + and set up fields of struct page newpage. Keep in mind that you should + indicate to the VM the oldpage is no longer movable via __ClearPageMovable() + under page_lock if you migrated the oldpage successfully and returned + MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver + can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time + because VM interprets -EAGAIN as "temporary migration failure". On returning + any error except -EAGAIN, VM will give up the page migration without + retrying. + + Driver shouldn't touch the page.lru field while in the migratepage() function. + +3. ``void (*putback_page)(struct page *);`` + + If migration fails on the isolated page, VM should return the isolated page + to the driver so VM calls the driver's putback_page() with the isolated page. + In this function, the driver should put the isolated page back into its own data + structure. + +Non-LRU movable page flags + + There are two page flags for supporting non-LRU movable page. + + * PG_movable + + Driver should use the function below to make page movable under page_lock:: + + void __SetPageMovable(struct page *page, struct address_space *mapping) + + It needs argument of address_space for registering migration + family functions which will be called by VM. Exactly speaking, + PG_movable is not a real flag of struct page. Rather, VM + reuses the page->mapping's lower bits to represent it:: + + #define PAGE_MAPPING_MOVABLE 0x2 + page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; + + so driver shouldn't access page->mapping directly. Instead, driver should + use page_mapping() which masks off the low two bits of page->mapping under + page lock so it can get the right struct address_space. + + For testing of non-LRU movable pages, VM supports __PageMovable() function. + However, it doesn't guarantee to identify non-LRU movable pages because + the page->mapping field is unified with other variables in struct page. + If the driver releases the page after isolation by VM, page->mapping + doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set + (look at __ClearPageMovable). But __PageMovable() is cheap to call whether + page is LRU or non-LRU movable once the page has been isolated because LRU + pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also + good for just peeking to test non-LRU movable pages before more expensive + checking with lock_page() in pfn scanning to select a victim. + + For guaranteeing non-LRU movable page, VM provides PageMovable() function. + Unlike __PageMovable(), PageMovable() validates page->mapping and + mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents + sudden destroying of page->mapping. + + Drivers using __SetPageMovable() should clear the flag via + __ClearMovablePage() under page_lock() before the releasing the page. + + * PG_isolated + + To prevent concurrent isolation among several CPUs, VM marks isolated page + as PG_isolated under lock_page(). So if a CPU encounters PG_isolated + non-LRU movable page, it can skip it. Driver doesn't need to manipulate the + flag because VM will set/clear it automatically. Keep in mind that if the + driver sees a PG_isolated page, it means the page has been isolated by the + VM so it shouldn't touch the page.lru field. + The PG_isolated flag is aliased with the PG_reclaim flag so drivers + shouldn't use PG_isolated for its own purposes. + +Monitoring Migration +===================== + +The following events (counters) can be used to monitor page migration. + +1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a + page was migrated. If the page was a non-THP and non-hugetlb page, then + this counter is increased by one. If the page was a THP or hugetlb, then + this counter is increased by the number of THP or hugetlb subpages. + For example, migration of a single 2MB THP that has 4KB-size base pages + (subpages) will cause this counter to increase by 512. + +2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for + PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, + if it was a THP or hugetlb. + +3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. + +4. THP_MIGRATION_FAIL: A THP could not be migrated nor it could be split. + +5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had + to be split. After splitting, a migration retry was used for it's sub-pages. + +THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or +PGMIGRATE_FAIL events. For example, a THP migration failure will cause both +THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase. + +Christoph Lameter, May 8, 2006. +Minchan Kim, Mar 28, 2016. diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst new file mode 100644 index 000000000000..f5c954afe97c --- /dev/null +++ b/Documentation/mm/page_owner.rst @@ -0,0 +1,196 @@ +.. _page_owner: + +================================================== +page owner: Tracking about who allocated each page +================================================== + +Introduction +============ + +page owner is for the tracking about who allocated each page. +It can be used to debug memory leak or to find a memory hogger. +When allocation happens, information about allocation such as call stack +and order of pages is stored into certain storage for each page. +When we need to know about status of all pages, we can get and analyze +this information. + +Although we already have tracepoint for tracing page allocation/free, +using it for analyzing who allocate each page is rather complex. We need +to enlarge the trace buffer for preventing overlapping until userspace +program launched. And, launched program continually dump out the trace +buffer for later analysis and it would change system behaviour with more +possibility rather than just keeping it in memory, so bad for debugging. + +page owner can also be used for various purposes. For example, accurate +fragmentation statistics can be obtained through gfp flag information of +each page. It is already implemented and activated if page owner is +enabled. Other usages are more than welcome. + +page owner is disabled by default. So, if you'd like to use it, you need +to add "page_owner=on" to your boot cmdline. If the kernel is built +with page owner and page owner is disabled in runtime due to not enabling +boot option, runtime overhead is marginal. If disabled in runtime, it +doesn't require memory to store owner information, so there is no runtime +memory overhead. And, page owner inserts just two unlikely branches into +the page allocator hotpath and if not enabled, then allocation is done +like as the kernel without page owner. These two unlikely branches should +not affect to allocation performance, especially if the static keys jump +label patching functionality is available. Following is the kernel's code +size change due to this facility. + +- Without page owner:: + + text data bss dec hex filename + 48392 2333 644 51369 c8a9 mm/page_alloc.o + +- With page owner:: + + text data bss dec hex filename + 48800 2445 644 51889 cab1 mm/page_alloc.o + 6662 108 29 6799 1a8f mm/page_owner.o + 1025 8 8 1041 411 mm/page_ext.o + +Although, roughly, 8 KB code is added in total, page_alloc.o increase by +520 bytes and less than half of it is in hotpath. Building the kernel with +page owner and turning it on if needed would be great option to debug +kernel memory problem. + +There is one notice that is caused by implementation detail. page owner +stores information into the memory from struct page extension. This memory +is initialized some time later than that page allocator starts in sparse +memory system, so, until initialization, many pages can be allocated and +they would have no owner information. To fix it up, these early allocated +pages are investigated and marked as allocated in initialization phase. +Although it doesn't mean that they have the right owner information, +at least, we can tell whether the page is allocated or not, +more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages +are catched and marked, although they are mostly allocated from struct +page extension feature. Anyway, after that, no page is left in +un-tracking state. + +Usage +===== + +1) Build user-space helper:: + + cd tools/vm + make page_owner_sort + +2) Enable page owner: add "page_owner=on" to boot cmdline. + +3) Do the job that you want to debug. + +4) Analyze information from page owner:: + + cat /sys/kernel/debug/page_owner > page_owner_full.txt + ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + + The general output of ``page_owner_full.txt`` is as follows:: + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows + in buf, uses regexp to extract the page order value, counts the times + and pages of buf, and finally sorts them according to the parameter(s). + + See the result about who allocated each page + in the ``sorted_page_owner.txt``. General output:: + + XXX times, XXX pages: + Page allocated via order XXX, ... + // Detailed stack + + By default, ``page_owner_sort`` is sorted according to the times of buf. + If you want to sort by the page nums of buf, use the ``-m`` parameter. + The detailed parameters are: + + fundamental function:: + + Sort: + -a Sort by memory allocation time. + -m Sort by total memory. + -p Sort by pid. + -P Sort by tgid. + -n Sort by task command name. + -r Sort by memory release time. + -s Sort by stack trace. + -t Sort by times (default). + --sort Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]]. + Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is + optional since default direction is increasing numerical or lexicographic + order. Mixed use of abbreviated and complete-form of keys is allowed. + + Examples: + ./page_owner_sort --sort=n,+pid,-tgid + ./page_owner_sort --sort=at + + additional function:: + + Cull: + --cull + Specify culling rules.Culling syntax is key[,key[,...]].Choose a + multi-letter key from the **STANDARD FORMAT SPECIFIERS** section. + + is a single argument in the form of a comma-separated list, + which offers a way to specify individual culling rules. The recognized + keywords are described in the **STANDARD FORMAT SPECIFIERS** section below. + can be specified by the sequence of keys k1,k2, ..., as described in + the STANDARD SORT KEYS section below. Mixed use of abbreviated and + complete-form of keys is allowed. + + Examples: + ./page_owner_sort --cull=stacktrace + ./page_owner_sort --cull=st,pid,name + ./page_owner_sort --cull=n,f + + Filter: + -f Filter out the information of blocks whose memory has been released. + + Select: + --pid Select by pid. This selects the blocks whose process ID + numbers appear in . + --tgid Select by tgid. This selects the blocks whose thread + group ID numbers appear in . + --name Select by task command name. This selects the blocks whose + task command name appear in . + + , , are single arguments in the form of a comma-separated list, + which offers a way to specify individual selecting rules. + + + Examples: + ./page_owner_sort --pid=1 + ./page_owner_sort --tgid=1,2,3 + ./page_owner_sort --name name1,name2 + +STANDARD FORMAT SPECIFIERS +========================== +:: + + For --sort option: + + KEY LONG DESCRIPTION + p pid process ID + tg tgid thread group ID + n name task command name + st stacktrace stack trace of the page allocation + T txt full text of block + ft free_ts timestamp of the page when it was released + at alloc_ts timestamp of the page when it was allocated + ator allocator memory allocator for pages + + For --curl option: + + KEY LONG DESCRIPTION + p pid process ID + tg tgid thread group ID + n name task command name + f free whether the page has been released or not + st stacktrace stack trace of the page allocation + ator allocator memory allocator for pages diff --git a/Documentation/mm/page_reclaim.rst b/Documentation/mm/page_reclaim.rst new file mode 100644 index 000000000000..50a30b7f8ac3 --- /dev/null +++ b/Documentation/mm/page_reclaim.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Page Reclaim +============ diff --git a/Documentation/mm/page_table_check.rst b/Documentation/mm/page_table_check.rst new file mode 100644 index 000000000000..1a09472f10a3 --- /dev/null +++ b/Documentation/mm/page_table_check.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _page_table_check: + +================ +Page Table Check +================ + +Introduction +============ + +Page table check allows to harden the kernel by ensuring that some types of +the memory corruptions are prevented. + +Page table check performs extra verifications at the time when new pages become +accessible from the userspace by getting their page table entries (PTEs PMDs +etc.) added into the table. + +In case of detected corruption, the kernel is crashed. There is a small +performance and memory overhead associated with the page table check. Therefore, +it is disabled by default, but can be optionally enabled on systems where the +extra hardening outweighs the performance costs. Also, because page table check +is synchronous, it can help with debugging double map memory corruption issues, +by crashing kernel at the time wrong mapping occurs instead of later which is +often the case with memory corruptions bugs. + +Double mapping detection logic +============================== + ++-------------------+-------------------+-------------------+------------------+ +| Current Mapping | New mapping | Permissions | Rule | ++===================+===================+===================+==================+ +| Anonymous | Anonymous | Read | Allow | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Anonymous | Read / Write | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Named | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Anonymous | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Named | Any | Allow | ++-------------------+-------------------+-------------------+------------------+ + +Enabling Page Table Check +========================= + +Build kernel with: + +- PAGE_TABLE_CHECK=y + Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK + is available. + +- Boot with 'page_table_check=on' kernel parameter. + +Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page +table support without extra kernel parameter. diff --git a/Documentation/mm/page_tables.rst b/Documentation/mm/page_tables.rst new file mode 100644 index 000000000000..96939571d7bc --- /dev/null +++ b/Documentation/mm/page_tables.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +Page Tables +=========== diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst new file mode 100644 index 000000000000..2ab7b8c1c863 --- /dev/null +++ b/Documentation/mm/physical_memory.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Physical Memory +=============== diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst new file mode 100644 index 000000000000..e8618fbc62c9 --- /dev/null +++ b/Documentation/mm/process_addrs.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Process Addresses +================= diff --git a/Documentation/mm/remap_file_pages.rst b/Documentation/mm/remap_file_pages.rst new file mode 100644 index 000000000000..7bef6718e3a9 --- /dev/null +++ b/Documentation/mm/remap_file_pages.rst @@ -0,0 +1,33 @@ +.. _remap_file_pages: + +============================== +remap_file_pages() system call +============================== + +The remap_file_pages() system call is used to create a nonlinear mapping, +that is, a mapping in which the pages of the file are mapped into a +nonsequential order in memory. The advantage of using remap_file_pages() +over using repeated calls to mmap(2) is that the former approach does not +require the kernel to create additional VMA (Virtual Memory Area) data +structures. + +Supporting of nonlinear mapping requires significant amount of non-trivial +code in kernel virtual memory subsystem including hot paths. Also to get +nonlinear mapping work kernel need a way to distinguish normal page table +entries from entries with file offset (pte_file). Kernel reserves flag in +PTE for this purpose. PTE flags are scarce resource especially on some CPU +architectures. It would be nice to free up the flag for other usage. + +Fortunately, there are not many users of remap_file_pages() in the wild. +It's only known that one enterprise RDBMS implementation uses the syscall +on 32-bit systems to map files bigger than can linearly fit into 32-bit +virtual address space. This use-case is not critical anymore since 64-bit +systems are widely available. + +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. + +One side effect of emulation (apart from performance) is that user can hit +vm.max_map_count limit more easily due to additional VMAs. See comment for +DEFAULT_MAX_MAP_COUNT for more details on the limit. diff --git a/Documentation/mm/shmfs.rst b/Documentation/mm/shmfs.rst new file mode 100644 index 000000000000..8b01ebb4c30e --- /dev/null +++ b/Documentation/mm/shmfs.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +Shared Memory Filesystem +======================== diff --git a/Documentation/mm/slab.rst b/Documentation/mm/slab.rst new file mode 100644 index 000000000000..87d5a5bb172f --- /dev/null +++ b/Documentation/mm/slab.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Slab Allocation +=============== diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst new file mode 100644 index 000000000000..43063ade737a --- /dev/null +++ b/Documentation/mm/slub.rst @@ -0,0 +1,452 @@ +.. _slub: + +========================== +Short users guide for SLUB +========================== + +The basic philosophy of SLUB is very different from SLAB. SLAB +requires rebuilding the kernel to activate debug options for all +slab caches. SLUB always includes full debugging but it is off by default. +SLUB can enable debugging only for selected slabs in order to avoid +an impact on overall system performance which may make a bug more +difficult to find. + +In order to switch debugging on one can add an option ``slub_debug`` +to the kernel command line. That will enable full debugging for +all slabs. + +Typically one would then use the ``slabinfo`` command to get statistical +data and perform operation on the slabs. By default ``slabinfo`` only lists +slabs that have data in them. See "slabinfo -h" for more options when +running the command. ``slabinfo`` can be compiled with +:: + + gcc -o slabinfo tools/vm/slabinfo.c + +Some of the modes of operation of ``slabinfo`` require that slub debugging +be enabled on the command line. F.e. no tracking information will be +available without debugging on and validation can only partially +be performed if debugging was not switched on. + +Some more sophisticated uses of slub_debug: +------------------------------------------- + +Parameters may be given to ``slub_debug``. If none is specified then full +debugging is enabled. Format: + +slub_debug= + Enable options for all slabs + +slub_debug=,,,... + Enable options only for select slabs (no spaces + after a comma) + +Multiple blocks of options for all slabs or selected slabs can be given, with +blocks of options delimited by ';'. The last of "all slabs" blocks is applied +to all slabs except those that match one of the "select slabs" block. Options +of the first "select slabs" blocks that matches the slab's name are applied. + +Possible debug options are:: + + F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS + Sorry SLAB legacy issues) + Z Red zoning + P Poisoning (object and padding) + U User tracking (free and alloc) + T Trace (please only use on single slabs) + A Enable failslab filter mark for the cache + O Switch debugging off for caches that would have + caused higher minimum slab orders + - Switch all debugging off (useful if the kernel is + configured with CONFIG_SLUB_DEBUG_ON) + +F.e. in order to boot just with sanity checks and red zoning one would specify:: + + slub_debug=FZ + +Trying to find an issue in the dentry cache? Try:: + + slub_debug=,dentry + +to only enable debugging on the dentry cache. You may use an asterisk at the +end of the slab name, in order to cover all slabs with the same prefix. For +example, here's how you can poison the dentry cache as well as all kmalloc +slabs:: + + slub_debug=P,kmalloc-*,dentry + +Red zoning and tracking may realign the slab. We can just apply sanity checks +to the dentry cache with:: + + slub_debug=F,dentry + +Debugging options may require the minimum possible slab order to increase as +a result of storing the metadata (for example, caches with PAGE_SIZE object +sizes). This has a higher liklihood of resulting in slab allocation errors +in low memory situations or if there's high fragmentation of memory. To +switch off debugging for such caches by default, use:: + + slub_debug=O + +You can apply different options to different list of slab names, using blocks +of options. This will enable red zoning for dentry and user tracking for +kmalloc. All other slabs will not get any debugging enabled:: + + slub_debug=Z,dentry;U,kmalloc-* + +You can also enable options (e.g. sanity checks and poisoning) for all caches +except some that are deemed too performance critical and don't need to be +debugged by specifying global debug options followed by a list of slab names +with "-" as options:: + + slub_debug=FZ;-,zs_handle,zspage + +The state of each debug option for a slab can be found in the respective files +under:: + + /sys/kernel/slab// + +If the file contains 1, the option is enabled, 0 means disabled. The debug +options from the ``slub_debug`` parameter translate to the following files:: + + F sanity_checks + Z red_zone + P poison + U store_user + T trace + A failslab + +Careful with tracing: It may spew out lots of information and never stop if +used on the wrong slab. + +Slab merging +============ + +If no debug options are specified then SLUB may merge similar slabs together +in order to reduce overhead and increase cache hotness of objects. +``slabinfo -a`` displays which slabs were merged together. + +Slab validation +=============== + +SLUB can validate all object if the kernel was booted with slub_debug. In +order to do so you must have the ``slabinfo`` tool. Then you can do +:: + + slabinfo -v + +which will test all objects. Output will be generated to the syslog. + +This also works in a more limited way if boot was without slab debug. +In that case ``slabinfo -v`` simply tests all reachable objects. Usually +these are in the cpu slabs and the partial slabs. Full slabs are not +tracked by SLUB in a non debug situation. + +Getting more performance +======================== + +To some degree SLUB's performance is limited by the need to take the +list_lock once in a while to deal with partial slabs. That overhead is +governed by the order of the allocation for each slab. The allocations +can be influenced by kernel parameters: + +.. slub_min_objects=x (default 4) +.. slub_min_order=x (default 0) +.. slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER)) + +``slub_min_objects`` + allows to specify how many objects must at least fit into one + slab in order for the allocation order to be acceptable. In + general slub will be able to perform this number of + allocations on a slab without consulting centralized resources + (list_lock) where contention may occur. + +``slub_min_order`` + specifies a minimum order of slabs. A similar effect like + ``slub_min_objects``. + +``slub_max_order`` + specified the order at which ``slub_min_objects`` should no + longer be checked. This is useful to avoid SLUB trying to + generate super large order pages to fit ``slub_min_objects`` + of a slab cache with large object sizes into one high order + page. Setting command line parameter + ``debug_guardpage_minorder=N`` (N > 0), forces setting + ``slub_max_order`` to 0, what cause minimum possible order of + slabs allocation. + +SLUB Debug output +================= + +Here is a sample of slub debug output:: + + ==================================================================== + BUG kmalloc-8: Right Redzone overwritten + -------------------------------------------------------------------- + + INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc + INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58 + INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 + INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 + + Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ + Object (0xc90f6d20): 31 30 31 39 2e 30 30 35 1019.005 + Redzone (0xc90f6d28): 00 cc cc cc . + Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ + + [] dump_trace+0x63/0x1eb + [] show_trace_log_lvl+0x1a/0x2f + [] show_trace+0x12/0x14 + [] dump_stack+0x16/0x18 + [] object_err+0x143/0x14b + [] check_object+0x66/0x234 + [] __slab_free+0x239/0x384 + [] kfree+0xa6/0xc6 + [] get_modalias+0xb9/0xf5 + [] dmi_dev_uevent+0x27/0x3c + [] dev_uevent+0x1ad/0x1da + [] kobject_uevent_env+0x20a/0x45b + [] kobject_uevent+0xa/0xf + [] store_uevent+0x4f/0x58 + [] dev_attr_store+0x29/0x2f + [] sysfs_write_file+0x16e/0x19c + [] vfs_write+0xd1/0x15a + [] sys_write+0x3d/0x72 + [] sysenter_past_esp+0x5f/0x99 + [] 0xb7f7b410 + ======================= + + FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc + +If SLUB encounters a corrupted object (full detection requires the kernel +to be booted with slub_debug) then the following output will be dumped +into the syslog: + +1. Description of the problem encountered + + This will be a message in the system log starting with:: + + =============================================== + BUG : + ----------------------------------------------- + + INFO: - + INFO: Slab
+ INFO: Object
+ INFO: Allocated in age= cpu= pid= + INFO: Freed in age= cpu= + pid= + + (Object allocation / free information is only available if SLAB_STORE_USER is + set for the slab. slub_debug sets that option) + +2. The object contents if an object was involved. + + Various types of lines can follow the BUG SLUB line: + + Bytes b4
: + Shows a few bytes before the object where the problem was detected. + Can be useful if the corruption does not stop with the start of the + object. + + Object
: + The bytes of the object. If the object is inactive then the bytes + typically contain poison values. Any non-poison value shows a + corruption by a write after free. + + Redzone
: + The Redzone following the object. The Redzone is used to detect + writes after the object. All bytes should always have the same + value. If there is any deviation then it is due to a write after + the object boundary. + + (Redzone information is only available if SLAB_RED_ZONE is set. + slub_debug sets that option) + + Padding
: + Unused data to fill up the space in order to get the next object + properly aligned. In the debug case we make sure that there are + at least 4 bytes of padding. This allows the detection of writes + before the object. + +3. A stackdump + + The stackdump describes the location where the error was detected. The cause + of the corruption is may be more likely found by looking at the function that + allocated or freed the object. + +4. Report on how the problem was dealt with in order to ensure the continued + operation of the system. + + These are messages in the system log beginning with:: + + FIX : + + In the above sample SLUB found that the Redzone of an active object has + been overwritten. Here a string of 8 characters was written into a slab that + has the length of 8 characters. However, a 8 character string needs a + terminating 0. That zero has overwritten the first byte of the Redzone field. + After reporting the details of the issue encountered the FIX SLUB message + tells us that SLUB has restored the Redzone to its proper value and then + system operations continue. + +Emergency operations +==================== + +Minimal debugging (sanity checks alone) can be enabled by booting with:: + + slub_debug=F + +This will be generally be enough to enable the resiliency features of slub +which will keep the system running even if a bad kernel component will +keep corrupting objects. This may be important for production systems. +Performance will be impacted by the sanity checks and there will be a +continual stream of error messages to the syslog but no additional memory +will be used (unlike full debugging). + +No guarantees. The kernel component still needs to be fixed. Performance +may be optimized further by locating the slab that experiences corruption +and enabling debugging only for that cache + +I.e.:: + + slub_debug=F,dentry + +If the corruption occurs by writing after the end of the object then it +may be advisable to enable a Redzone to avoid corrupting the beginning +of other objects:: + + slub_debug=FZ,dentry + +Extended slabinfo mode and plotting +=================================== + +The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes: + - Slabcache Totals + - Slabs sorted by size (up to -N slabs, default 1) + - Slabs sorted by loss (up to -N slabs, default 1) + +Additionally, in this mode ``slabinfo`` does not dynamically scale +sizes (G/M/K) and reports everything in bytes (this functionality is +also available to other slabinfo modes via '-B' option) which makes +reporting more precise and accurate. Moreover, in some sense the `-X' +mode also simplifies the analysis of slabs' behaviour, because its +output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it +pushes the analysis from looking through the numbers (tons of numbers) +to something easier -- visual analysis. + +To generate plots: + +a) collect slabinfo extended records, for example:: + + while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done + +b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script:: + + slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN] + + The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records + and generates 3 png files (and 3 pre-processing cache files) per STATS + file: + - Slabcache Totals: FOO_STATS-totals.png + - Slabs sorted by size: FOO_STATS-slabs-by-size.png + - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png + +Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you +need to compare slabs' behaviour "prior to" and "after" some code +modification. To help you out there, ``slabinfo-gnuplot.sh`` script +can 'merge' the `Slabcache Totals` sections from different +measurements. To visually compare N plots: + +a) Collect as many STATS1, STATS2, .. STATSN files as you need:: + + while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done + +b) Pre-process those STATS files:: + + slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN + +c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the + generated pre-processed \*-totals:: + + slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals + + This will produce a single plot (png file). + + Plots, expectedly, can be large so some fluctuations or small spikes + can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two + options to 'zoom-in'/'zoom-out': + + a) ``-s %d,%d`` -- overwrites the default image width and height + b) ``-r %d,%d`` -- specifies a range of samples to use (for example, + in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r + 40,60`` range will plot only samples collected between 40th and + 60th seconds). + + +DebugFS files for SLUB +====================== + +For more information about current state of SLUB caches with the user tracking +debug option enabled, debugfs files are available, typically under +/sys/kernel/debug/slab// (created only for caches with enabled user +tracking). There are 2 types of these files with the following debug +information: + +1. alloc_traces:: + + Prints information about unique allocation traces of the currently + allocated objects. The output is sorted by frequency of each trace. + + Information in the output: + Number of objects, allocating function, minimal/average/maximal jiffies since alloc, + pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. + + Example::: + + 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: + __slab_alloc+0x6d/0x90 + kmem_cache_alloc_trace+0x2eb/0x300 + populate_error_injection_list+0x97/0x110 + init_error_injection+0x1b/0x71 + do_one_initcall+0x5f/0x2d0 + kernel_init_freeable+0x26f/0x2d7 + kernel_init+0xe/0x118 + ret_from_fork+0x22/0x30 + + +2. free_traces:: + + Prints information about unique freeing traces of the currently allocated + objects. The freeing traces thus come from the previous life-cycle of the + objects and are reported as not available for objects allocated for the first + time. The output is sorted by frequency of each trace. + + Information in the output: + Number of objects, freeing function, minimal/average/maximal jiffies since free, + pid range of the freeing processes, cpu mask of freeing cpus, and stack trace. + + Example::: + + 1980 age=4294912290 pid=0 cpus=0 + 51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1 + kfree+0x2db/0x420 + acpi_ut_update_ref_count+0x6a6/0x782 + acpi_ut_update_object_reference+0x1ad/0x234 + acpi_ut_remove_reference+0x7d/0x84 + acpi_rs_get_prt_method_data+0x97/0xd6 + acpi_get_irq_routing_table+0x82/0xc4 + acpi_pci_irq_find_prt_entry+0x8e/0x2e0 + acpi_pci_irq_lookup+0x3a/0x1e0 + acpi_pci_irq_enable+0x77/0x240 + pcibios_enable_device+0x39/0x40 + do_pci_enable_device.part.0+0x5d/0xe0 + pci_enable_device_flags+0xfc/0x120 + pci_enable_device+0x13/0x20 + virtio_pci_probe+0x9e/0x170 + local_pci_probe+0x48/0x80 + pci_device_probe+0x105/0x1c0 + +Christoph Lameter, May 30, 2007 +Sergey Senozhatsky, October 23, 2015 diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst new file mode 100644 index 000000000000..c08919662704 --- /dev/null +++ b/Documentation/mm/split_page_table_lock.rst @@ -0,0 +1,100 @@ +.. _split_page_table_lock: + +===================== +Split page table lock +===================== + +Originally, mm->page_table_lock spinlock protected all page tables of the +mm_struct. But this approach leads to poor page fault scalability of +multi-threaded applications due high contention on the lock. To improve +scalability, split page table lock was introduced. + +With split page table lock we have separate per-table lock to serialize +access to the table. At the moment we use split lock for PTE and PMD +tables. Access to higher level tables protected by mm->page_table_lock. + +There are helpers to lock/unlock a table and other accessor functions: + + - pte_offset_map_lock() + maps pte and takes PTE table lock, returns pointer to the taken + lock; + - pte_unmap_unlock() + unlocks and unmaps PTE table; + - pte_alloc_map_lock() + allocates PTE table if needed and take the lock, returns pointer + to taken lock or NULL if allocation failed; + - pte_lockptr() + returns pointer to PTE table lock; + - pmd_lock() + takes PMD table lock, returns pointer to taken lock; + - pmd_lockptr() + returns pointer to PMD table lock; + +Split page table lock for PTE tables is enabled compile-time if +CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS. +If split lock is disabled, all tables are guarded by mm->page_table_lock. + +Split page table lock for PMD tables is enabled, if it's enabled for PTE +tables and the architecture supports it (see below). + +Hugetlb and split page table lock +================================= + +Hugetlb can support several page sizes. We use split lock only for PMD +level, but not for PUD. + +Hugetlb-specific helpers: + + - huge_pte_lock() + takes pmd split lock for PMD_SIZE page, mm->page_table_lock + otherwise; + - huge_pte_lockptr() + returns pointer to table lock; + +Support of split page table lock by an architecture +=================================================== + +There's no need in special enabling of PTE split page table lock: everything +required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which +must be called on PTE table allocation / freeing. + +Make sure the architecture doesn't use slab allocator for page table +allocation: slab uses page->slab_cache for its pages. +This field shares storage with page->ptl. + +PMD split lock only makes sense if you have more than two page table +levels. + +PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table +allocation and pgtable_pmd_page_dtor() on freeing. + +Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and +pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing +paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). + +With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. + +NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +be handled properly. + +page->ptl +========= + +page->ptl is used to access split page table lock, where 'page' is struct +page of page containing the table. It shares storage with page->private +(and few other fields in union). + +To avoid increasing size of struct page and have best performance, we use a +trick: + + - if spinlock_t fits into long, we use page->ptr as spinlock, so we + can avoid indirect access and save a cache line. + - if size of spinlock_t is bigger then size of long, we use page->ptl as + pointer to spinlock_t and allocate it dynamically. This allows to use + split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs + one more cache line for indirect access; + +The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in +pgtable_pmd_page_ctor() for PMD table. + +Please, never access page->ptl directly -- use appropriate helper. diff --git a/Documentation/mm/swap.rst b/Documentation/mm/swap.rst new file mode 100644 index 000000000000..78819bd4d745 --- /dev/null +++ b/Documentation/mm/swap.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +Swap +==== diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst new file mode 100644 index 000000000000..216db1d67d04 --- /dev/null +++ b/Documentation/mm/transhuge.rst @@ -0,0 +1,187 @@ +.. _transhuge: + +============================ +Transparent Hugepage Support +============================ + +This document describes design principles for Transparent Hugepage (THP) +support and its interaction with other parts of the memory management +system. + +Design principles +================= + +- "graceful fallback": mm components which don't have transparent hugepage + knowledge fall back to breaking huge pmd mapping into table of ptes and, + if necessary, split a transparent hugepage. Therefore these components + can continue working on the regular pages or regular pte mappings. + +- if a hugepage allocation fails because of memory fragmentation, + regular pages should be gracefully allocated instead and mixed in + the same vma without any failure or significant delay and without + userland noticing + +- if some task quits and more hugepages become available (either + immediately in the buddy or through the VM), guest physical memory + backed by regular pages should be relocated on hugepages + automatically (with khugepaged) + +- it doesn't require memory reservation and in turn it uses hugepages + whenever possible (the only possible reservation here is kernelcore= + to avoid unmovable pages to fragment all the memory but such a tweak + is not specific to transparent hugepage support and it's a generic + feature that applies to all dynamic high order allocations in the + kernel) + +get_user_pages and follow_page +============================== + +get_user_pages and follow_page if run on a hugepage, will return the +head or tail pages as usual (exactly as they would do on +hugetlbfs). Most GUP users will only care about the actual physical +address of the page and its temporary pinning to release after the I/O +is complete, so they won't ever notice the fact the page is huge. But +if any driver is going to mangle over the page structure of the tail +page (like for checking page->mapping or other bits that are relevant +for the head page and not the tail page), it should be updated to jump +to check head page instead. Taking a reference on any head/tail page would +prevent the page from being split by anyone. + +.. note:: + these aren't new constraints to the GUP API, and they match the + same constraints that apply to hugetlbfs too, so any driver capable + of handling GUP on hugetlbfs will also work fine on transparent + hugepage backed mappings. + +Graceful fallback +================= + +Code walking pagetables but unaware about huge pmds can simply call +split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by +pmd_offset. It's trivial to make the code transparent hugepage aware +by just grepping for "pmd_offset" and adding split_huge_pmd where +missing after pmd_offset returns the pmd. Thanks to the graceful +fallback design, with a one liner change, you can avoid to write +hundreds if not thousands of lines of complex code to make your code +hugepage aware. + +If you're not walking pagetables but you run into a physical hugepage +that you can't handle natively in your code, you can split it by +calling split_huge_page(page). This is what the Linux VM does before +it tries to swapout the hugepage for example. split_huge_page() can fail +if the page is pinned and you must handle this correctly. + +Example to make mremap.c transparent hugepage aware with a one liner +change:: + + diff --git a/mm/mremap.c b/mm/mremap.c + --- a/mm/mremap.c + +++ b/mm/mremap.c + @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru + return NULL; + + pmd = pmd_offset(pud, addr); + + split_huge_pmd(vma, pmd, addr); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + +Locking in hugepage aware code +============================== + +We want as much code as possible hugepage aware, as calling +split_huge_page() or split_huge_pmd() has a cost. + +To make pagetable walks huge pmd aware, all you need to do is to call +pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the +mmap_lock in read (or write) mode to be sure a huge pmd cannot be +created from under you by khugepaged (khugepaged collapse_huge_page +takes the mmap_lock in write mode in addition to the anon_vma lock). If +pmd_trans_huge returns false, you just fallback in the old code +paths. If instead pmd_trans_huge returns true, you have to take the +page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the +page table lock will prevent the huge pmd being converted into a +regular pmd from under you (split_huge_pmd can run in parallel to the +pagetable walk). If the second pmd_trans_huge returns false, you +should just drop the page table lock and fallback to the old code as +before. Otherwise, you can proceed to process the huge pmd and the +hugepage natively. Once finished, you can drop the page table lock. + +Refcounts and transparent huge pages +==================================== + +Refcounting on THP is mostly consistent with refcounting on other compound +pages: + + - get_page()/put_page() and GUP operate on head page's ->_refcount. + + - ->_refcount in tail pages is always zero: get_page_unless_zero() never + succeeds on tail pages. + + - map/unmap of the pages with PTE entry increment/decrement ->_mapcount + on relevant sub-page of the compound page. + + - map/unmap of the whole compound page is accounted for in compound_mapcount + (stored in first tail page). For file huge pages, we also increment + ->_mapcount of all sub-pages in order to have race-free detection of + last unmap of subpages. + +PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. + +For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all +subpages is offset up by one. This additional reference is required to +get race-free detection of unmap of subpages when we have them mapped with +both PMDs and PTEs. + +This optimization is required to lower the overhead of per-subpage mapcount +tracking. The alternative is to alter ->_mapcount in all subpages on each +map/unmap of the whole compound page. + +For anonymous pages, we set PG_double_map when a PMD of the page is split +for the first time, but still have a PMD mapping. The additional references +go away with the last compound_mapcount. + +File pages get PG_double_map set on the first map of the page with PTE and +goes away when the page gets evicted from the page cache. + +split_huge_page internally has to distribute the refcounts in the head +page to the tail pages before clearing all PG_head/tail bits from the page +structures. It can be done easily for refcounts taken by page table +entries, but we don't have enough information on how to distribute any +additional pins (i.e. from get_user_pages). split_huge_page() fails any +requests to split pinned huge pages: it expects page count to be equal to +the sum of mapcount of all sub-pages plus one (split_huge_page caller must +have a reference to the head page). + +split_huge_page uses migration entries to stabilize page->_refcount and +page->_mapcount of anonymous pages. File pages just get unmapped. + +We are safe against physical memory scanners too: the only legitimate way +a scanner can get a reference to a page is get_page_unless_zero(). + +All tail pages have zero ->_refcount until atomic_add(). This prevents the +scanner from getting a reference to the tail page up to that point. After the +atomic_add() we don't care about the ->_refcount value. We already know how +many references should be uncharged from the head page. + +For head page get_page_unless_zero() will succeed and we don't mind. It's +clear where references should go after split: it will stay on the head page. + +Note that split_huge_pmd() doesn't have any limitations on refcounting: +pmd can be split at any point and never fails. + +Partial unmap and deferred_split_huge_page() +============================================ + +Unmapping part of THP (with munmap() or other way) is not going to free +memory immediately. Instead, we detect that a subpage of THP is not in use +in page_remove_rmap() and queue the THP for splitting if memory pressure +comes. Splitting will free up unused subpages. + +Splitting the page right away is not an option due to locking context in +the place where we can detect partial unmap. It also might be +counterproductive since in many cases partial unmap happens during exit(2) if +a THP crosses a VMA boundary. + +The function deferred_split_huge_page() is used to queue a page for splitting. +The splitting itself will happen when we get memory pressure via shrinker +interface. diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst new file mode 100644 index 000000000000..b280367d6a44 --- /dev/null +++ b/Documentation/mm/unevictable-lru.rst @@ -0,0 +1,554 @@ +.. _unevictable_lru: + +============================== +Unevictable LRU Infrastructure +============================== + +.. contents:: :local: + + +Introduction +============ + +This document describes the Linux memory manager's "Unevictable LRU" +infrastructure and the use of this to manage several types of "unevictable" +pages. + +The document attempts to provide the overall rationale behind this mechanism +and the rationale for some of the design decisions that drove the +implementation. The latter design rationale is discussed in the context of an +implementation description. Admittedly, one can obtain the implementation +details - the "what does it do?" - by reading the code. One hopes that the +descriptions below add value by provide the answer to "why does it do that?". + + + +The Unevictable LRU +=================== + +The Unevictable LRU facility adds an additional LRU list to track unevictable +pages and to hide these pages from vmscan. This mechanism is based on a patch +by Larry Woodman of Red Hat to address several scalability problems with page +reclaim in Linux. The problems have been observed at customer sites on large +memory x86_64 systems. + +To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of +main memory will have over 32 million 4k pages in a single node. When a large +fraction of these pages are not evictable for any reason [see below], vmscan +will spend a lot of time scanning the LRU lists looking for the small fraction +of pages that are evictable. This can result in a situation where all CPUs are +spending 100% of their time in vmscan for hours or days on end, with the system +completely unresponsive. + +The unevictable list addresses the following classes of unevictable pages: + + * Those owned by ramfs. + + * Those mapped into SHM_LOCK'd shared memory regions. + + * Those mapped into VM_LOCKED [mlock()ed] VMAs. + +The infrastructure may also be able to handle other conditions that make pages +unevictable, either by definition or by circumstance, in the future. + + +The Unevictable LRU Page List +----------------------------- + +The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a +companion to the LRU-ordered anonymous and file, active and inactive page lists; +and now it is not even a page list. But following familiar convention, here in +this document and in the source, we often imagine it as a fifth LRU page list. + +The Unevictable LRU infrastructure consists of an additional, per-node, LRU list +called the "unevictable" list and an associated page flag, PG_unevictable, to +indicate that the page is being managed on the unevictable list. + +The PG_unevictable flag is analogous to, and mutually exclusive with, the +PG_active flag in that it indicates on which LRU list a page resides when +PG_lru is set. + +The Unevictable LRU infrastructure maintains unevictable pages as if they were +on an additional LRU list for a few reasons: + + (1) We get to "treat unevictable pages just like we treat other pages in the + system - which means we get to use the same code to manipulate them, the + same code to isolate them (for migrate, etc.), the same code to keep track + of the statistics, etc..." [Rik van Riel] + + (2) We want to be able to migrate unevictable pages between nodes for memory + defragmentation, workload management and memory hotplug. The Linux kernel + can only migrate pages that it can successfully isolate from the LRU + lists (or "Movable" pages: outside of consideration here). If we were to + maintain pages elsewhere than on an LRU-like list, where they can be + detected by isolate_lru_page(), we would prevent their migration. + +The unevictable list does not differentiate between file-backed and anonymous, +swap-backed pages. This differentiation is only important while the pages are, +in fact, evictable. + +The unevictable list benefits from the "arrayification" of the per-node LRU +lists and statistics originally proposed and posted by Christoph Lameter. + + +Memory Control Group Interaction +-------------------------------- + +The unevictable LRU facility interacts with the memory control group [aka +memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by +extending the lru_list enum. + +The memory controller data structure automatically gets a per-node unevictable +list as a result of the "arrayification" of the per-node LRU lists (one per +lru_list enum element). The memory controller tracks the movement of pages to +and from the unevictable list. + +When a memory control group comes under memory pressure, the controller will +not attempt to reclaim pages on the unevictable list. This has a couple of +effects: + + (1) Because the pages are "hidden" from reclaim on the unevictable list, the + reclaim process can be more efficient, dealing only with pages that have a + chance of being reclaimed. + + (2) On the other hand, if too many of the pages charged to the control group + are unevictable, the evictable portion of the working set of the tasks in + the control group may not fit into the available memory. This can cause + the control group to thrash or to OOM-kill tasks. + + +.. _mark_addr_space_unevict: + +Marking Address Spaces Unevictable +---------------------------------- + +For facilities such as ramfs none of the pages attached to the address space +may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE +address space flag is provided, and this can be manipulated by a filesystem +using a number of wrapper functions: + + * ``void mapping_set_unevictable(struct address_space *mapping);`` + + Mark the address space as being completely unevictable. + + * ``void mapping_clear_unevictable(struct address_space *mapping);`` + + Mark the address space as being evictable. + + * ``int mapping_unevictable(struct address_space *mapping);`` + + Query the address space, and return true if it is completely + unevictable. + +These are currently used in three places in the kernel: + + (1) By ramfs to mark the address spaces of its inodes when they are created, + and this mark remains for the life of the inode. + + (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. + Note that SHM_LOCK is not required to page in the locked pages if they're + swapped out; the application must touch the pages manually if it wants to + ensure they're in memory. + + (3) By the i915 driver to mark pinned address space until it's unpinned. The + amount of unevictable memory marked by i915 driver is roughly the bounded + object size in debugfs/dri/0/i915_gem_objects. + + +Detecting Unevictable Pages +--------------------------- + +The function page_evictable() in mm/internal.h determines whether a page is +evictable or not using the query function outlined above [see section +:ref:`Marking address spaces unevictable `] +to check the AS_UNEVICTABLE flag. + +For address spaces that are so marked after being populated (as SHM regions +might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate +the page tables for the region as does, for example, mlock(), nor need it make +any special effort to push any pages in the SHM_LOCK'd area to the unevictable +list. Instead, vmscan will do this if and when it encounters the pages during +a reclamation scan. + +On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan +the pages in the region and "rescue" them from the unevictable list if no other +condition is keeping them unevictable. If an unevictable region is destroyed, +the pages are also "rescued" from the unevictable list in the process of +freeing them. + +page_evictable() also checks for mlocked pages by testing an additional page +flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is +faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED. + + +Vmscan's Handling of Unevictable Pages +-------------------------------------- + +If unevictable pages are culled in the fault path, or moved to the unevictable +list at mlock() or mmap() time, vmscan will not encounter the pages until they +have become evictable again (via munlock() for example) and have been "rescued" +from the unevictable list. However, there may be situations where we decide, +for the sake of expediency, to leave an unevictable page on one of the regular +active/inactive LRU lists for vmscan to deal with. vmscan checks for such +pages in all of the shrink_{active|inactive|page}_list() functions and will +"cull" such pages that it encounters: that is, it diverts those pages to the +unevictable list for the memory cgroup and node being scanned. + +There may be situations where a page is mapped into a VM_LOCKED VMA, but the +page is not marked as PG_mlocked. Such pages will make it all the way to +shrink_active_list() or shrink_page_list() where they will be detected when +vmscan walks the reverse map in page_referenced() or try_to_unmap(). The page +is culled to the unevictable list when it is released by the shrinker. + +To "cull" an unevictable page, vmscan simply puts the page back on the LRU list +using putback_lru_page() - the inverse operation to isolate_lru_page() - after +dropping the page lock. Because the condition which makes the page unevictable +may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the +unevictable state of a page before placing it on the unevictable list. + + +MLOCKED Pages +============= + +The unevictable page list is also useful for mlock(), in addition to ramfs and +SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in +NOMMU situations, all mappings are effectively mlocked. + + +History +------- + +The "Unevictable mlocked Pages" infrastructure is based on work originally +posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". +Nick posted his patch as an alternative to a patch posted by Christoph Lameter +to achieve the same objective: hiding mlocked pages from vmscan. + +In Nick's patch, he used one of the struct page LRU list link fields as a count +of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years +earlier). But this use of the link field for a count prevented the management +of the pages on an LRU list, and thus mlocked pages were not migratable as +isolate_lru_page() could not detect them, and the LRU list link field was not +available to the migration subsystem. + +Nick resolved this by putting mlocked pages back on the LRU list before +attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When +Nick's patch was integrated with the Unevictable LRU work, the count was +replaced by walking the reverse map when munlocking, to determine whether any +other VM_LOCKED VMAs still mapped the page. + +However, walking the reverse map for each page when munlocking was ugly and +inefficient, and could lead to catastrophic contention on a file's rmap lock, +when many processes which had it mlocked were trying to exit. In 5.18, the +idea of keeping mlock_count in Unevictable LRU list link field was revived and +put to work, without preventing the migration of mlocked pages. This is why +the "Unevictable LRU list" cannot be a linked list of pages now; but there was +no use for that linked list anyway - though its size is maintained for meminfo. + + +Basic Management +---------------- + +mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable +pages. When such a page has been "noticed" by the memory management subsystem, +the page is marked with the PG_mlocked flag. This can be manipulated using the +PageMlocked() functions. + +A PG_mlocked page will be placed on the unevictable list when it is added to +the LRU. Such pages can be "noticed" by memory management in several places: + + (1) in the mlock()/mlock2()/mlockall() system call handlers; + + (2) in the mmap() system call handler when mmapping a region with the + MAP_LOCKED flag; + + (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE + flag; + + (4) in the fault path and when a VM_LOCKED stack segment is expanded; or + + (5) as mentioned above, in vmscan:shrink_page_list() when attempting to + reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap(). + +mlocked pages become unlocked and rescued from the unevictable list when: + + (1) mapped in a range unlocked via the munlock()/munlockall() system calls; + + (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including + unmapping at task exit; + + (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; + or + + (4) before a page is COW'd in a VM_LOCKED VMA. + + +mlock()/mlock2()/mlockall() System Call Handling +------------------------------------------------ + +mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup() +for each VMA in the range specified by the call. In the case of mlockall(), +this is the entire active address space of the task. Note that mlock_fixup() +is used for both mlocking and munlocking a range of memory. A call to mlock() +an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is +treated as a no-op and mlock_fixup() simply returns. + +If the VMA passes some filtering as described in "Filtering Special VMAs" +below, mlock_fixup() will attempt to merge the VMA with its neighbors or split +off a subset of the VMA if the range does not cover the entire VMA. Any pages +already present in the VMA are then marked as mlocked by mlock_page() via +mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). + +Before returning from the system call, do_mlock() or mlockall() will call +__mm_populate() to fault in the remaining pages via get_user_pages() and to +mark those pages as mlocked as they are faulted. + +Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, +get_user_pages() will be unable to fault in the pages. That's okay. If pages +do end up getting faulted into this VM_LOCKED VMA, they will be handled in the +fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. + +For each PTE (or PMD) being faulted into a VMA, the page add rmap function +calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED +(unless it is a PTE mapping of a part of a transparent huge page). Or when +it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() +calls mlock_new_page() instead: similar to mlock_page(), but can make better +judgments, since this page is held exclusively and known not to be on LRU yet. + +mlock_page() sets PageMlocked immediately, then places the page on the CPU's +mlock pagevec, to batch up the rest of the work to be done under lru_lock by +__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count +and moves the page to unevictable state ("the unevictable LRU", but with +mlock_count in place of LRU threading). Or if the page was already PageLRU +and PageUnevictable and PageMlocked, it simply increments the mlock_count. + +But in practice that may not work ideally: the page may not yet be on an LRU, or +it may have been temporarily isolated from LRU. In such cases the mlock_count +field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() +returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: +rather than risk stranding a page indefinitely as unevictable, always err with +mlock_count on the low side, so that when munlocked the page will be rescued to +an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a +VM_LOCKED VMA. + + +Filtering Special VMAs +---------------------- + +mlock_fixup() filters several classes of "special" VMAs: + +1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind + these mappings are inherently pinned, so we don't need to mark them as + mlocked. In any case, most of the pages have no struct page in which to so + mark the page. Because of this, get_user_pages() will fail for these VMAs, + so there is no sense in attempting to visit them. + +2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We + neither need nor want to mlock() these pages. But __mm_populate() includes + hugetlbfs ranges, allocating the huge pages and populating the PTEs. + +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. These pages are inherently + unevictable and are not managed on the LRU lists. __mm_populate() includes + these ranges, populating the PTEs if not already populated. + +4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate() + includes these ranges, populating the PTEs if not already populated. + +Note that for all of these special VMAs, mlock_fixup() does not set the +VM_LOCKED flag. Therefore, we won't have to deal with them later during +munlock(), munmap() or task exit. Neither does mlock_fixup() account these +VMAs against the task's "locked_vm". + + +munlock()/munlockall() System Call Handling +------------------------------------------- + +The munlock() and munlockall() system calls are handled by the same +mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are. +If called to munlock an already munlocked VMA, mlock_fixup() simply returns. +Because of the VMA filtering discussed above, VM_LOCKED will not be set in +any "special" VMAs. So, those VMAs will be ignored for munlock. + +If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the +specified range. All pages in the VMA are then munlocked by munlock_page() via +mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same +function used when mlocking a VMA range, with new flags for the VMA indicating +that it is munlock() being performed. + +munlock_page() uses the mlock pagevec to batch up work to be done under +lru_lock by __munlock_page(). __munlock_page() decrements the page's +mlock_count, and when that reaches 0 it clears PageMlocked and clears +PageUnevictable, moving the page from unevictable state to inactive LRU. + +But in practice that may not work ideally: the page may not yet have reached +"the unevictable LRU", or it may have been temporarily isolated from it. In +those cases its mlock_count field is unusable and must be assumed to be 0: so +that the page will be rescued to an evictable LRU, then perhaps be mlocked +again later if vmscan finds it in a VM_LOCKED VMA. + + +Migrating MLOCKED Pages +----------------------- + +A page that is being migrated has been isolated from the LRU lists and is held +locked across unmapping of the page, updating the page's address space entry +and copying the contents and state, until the page table entry has been +replaced with an entry that refers to the new page. Linux supports migration +of mlocked pages and other unevictable pages. PG_mlocked is cleared from the +the old page when it is unmapped from the last VM_LOCKED VMA, and set when the +new page is mapped in place of migration entry in a VM_LOCKED VMA. If the page +was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the +page was unevictable for other reasons, PG_unevictable is copied explicitly. + +Note that page migration can race with mlocking or munlocking of the same page. +There is mostly no problem since page migration requires unmapping all PTEs of +the old page (including munlock where VM_LOCKED), then mapping in the new page +(including mlock where VM_LOCKED). The page table locks provide sufficient +synchronization. + +However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA, +before mlocking any pages already present, if one of those pages were migrated +before mlock_pte_range() reached it, it would get counted twice in mlock_count. +To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO, +so that mlock_vma_page() will skip it. + +To complete page migration, we place the old and new pages back onto the LRU +afterwards. The "unneeded" page - old page on success, new page on failure - +is freed when the reference count held by the migration process is released. + + +Compacting MLOCKED Pages +------------------------ + +The memory map can be scanned for compactable regions and the default behavior +is to let unevictable pages be moved. /proc/sys/vm/compact_unevictable_allowed +controls this behavior (see Documentation/admin-guide/sysctl/vm.rst). The work +of compaction is mostly handled by the page migration code and the same work +flow as described in Migrating MLOCKED Pages will apply. + + +MLOCKING Transparent Huge Pages +------------------------------- + +A transparent huge page is represented by a single entry on an LRU list. +Therefore, we can only make unevictable an entire compound page, not +individual subpages. + +If a user tries to mlock() part of a huge page, and no user mlock()s the +whole of the huge page, we want the rest of the page to be reclaimable. + +We cannot just split the page on partial mlock() as split_huge_page() can +fail and a new intermittent failure mode for the syscall is undesirable. + +We handle this by keeping PTE-mlocked huge pages on evictable LRU lists: +the PMD on the border of a VM_LOCKED VMA will be split into a PTE table. + +This way the huge page is accessible for vmscan. Under memory pressure the +page will be split, subpages which belong to VM_LOCKED VMAs will be moved +to the unevictable LRU and the rest can be reclaimed. + +/proc/meminfo's Unevictable and Mlocked amounts do not include those parts +of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs. + + +mmap(MAP_LOCKED) System Call Handling +------------------------------------- + +In addition to the mlock(), mlock2() and mlockall() system calls, an application +can request that a region of memory be mlocked by supplying the MAP_LOCKED flag +to the mmap() call. There is one important and subtle difference here, though. +mmap() + mlock() will fail if the range cannot be faulted in (e.g. because +mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. +The mmaped area will still have properties of the locked area - pages will not +get swapped out - but major page faults to fault memory in might still happen. + +Furthermore, any mmap() call or brk() call that expands the heap by a task +that has previously called mlockall() with the MCL_FUTURE flag will result +in the newly mapped memory being mlocked. Before the unevictable/mlock +changes, the kernel simply called make_pages_present() to allocate pages +and populate the page table. + +To mlock a range of memory under the unevictable/mlock infrastructure, +the mmap() handler and task address space expansion functions call +populate_vma_page_range() specifying the vma and the address range to mlock. + + +munmap()/exit()/exec() System Call Handling +------------------------------------------- + +When unmapping an mlocked region of memory, whether by an explicit call to +munmap() or via an internal unmap from exit() or exec() processing, we must +munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. +Before the unevictable/mlock changes, mlocking did not mark the pages in any +way, so unmapping them required no processing. + +For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls +munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +(unless it was a PTE mapping of a part of a transparent huge page). + +munlock_page() uses the mlock pagevec to batch up work to be done under +lru_lock by __munlock_page(). __munlock_page() decrements the page's +mlock_count, and when that reaches 0 it clears PageMlocked and clears +PageUnevictable, moving the page from unevictable state to inactive LRU. + +But in practice that may not work ideally: the page may not yet have reached +"the unevictable LRU", or it may have been temporarily isolated from it. In +those cases its mlock_count field is unusable and must be assumed to be 0: so +that the page will be rescued to an evictable LRU, then perhaps be mlocked +again later if vmscan finds it in a VM_LOCKED VMA. + + +Truncating MLOCKED Pages +------------------------ + +File truncation or hole punching forcibly unmaps the deleted pages from +userspace; truncation even unmaps and deletes any private anonymous pages +which had been Copied-On-Write from the file pages now being truncated. + +Mlocked pages can be munlocked and deleted in this way: like with munmap(), +for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls +munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +(unless it was a PTE mapping of a part of a transparent huge page). + +However, if there is a racing munlock(), since mlock_vma_pages_range() starts +munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages +present, if one of those pages were unmapped by truncation or hole punch before +mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, +and would not be counted out of mlock_count. In this rare case, a page may +still appear as PageMlocked after it has been fully unmapped: and it is left to +release_pages() (or __page_cache_release()) to clear it and update statistics +before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, +which is usually 0). + + +Page Reclaim in shrink_*_list() +------------------------------- + +vmscan's shrink_active_list() culls any obviously unevictable pages - +i.e. !page_evictable(page) pages - diverting those to the unevictable list. +However, shrink_active_list() only sees unevictable pages that made it onto the +active/inactive LRU lists. Note that these pages do not have PageUnevictable +set - otherwise they would be on the unevictable list and shrink_active_list() +would never see them. + +Some examples of these unevictable pages on the LRU lists are: + + (1) ramfs pages that have been placed on the LRU lists when first allocated. + + (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to + allocate or fault in the pages in the shared memory region. This happens + when an application accesses the page the first time after SHM_LOCK'ing + the segment. + + (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, + but events left mlock_count too low, so they were munlocked too early. + +vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously +unevictable pages found on the inactive lists to the appropriate memory cgroup +and node unevictable list. + +rmap's page_referenced_one(), called via vmscan's shrink_active_list() or +shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), +check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() +to correct them. Such pages are culled to the unevictable list when released +by the shrinker. diff --git a/Documentation/mm/vmalloc.rst b/Documentation/mm/vmalloc.rst new file mode 100644 index 000000000000..363fe20d6b9f --- /dev/null +++ b/Documentation/mm/vmalloc.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================== +Virtually Contiguous Memory Allocation +====================================== diff --git a/Documentation/mm/vmalloced-kernel-stacks.rst b/Documentation/mm/vmalloced-kernel-stacks.rst new file mode 100644 index 000000000000..fc8c67833af6 --- /dev/null +++ b/Documentation/mm/vmalloced-kernel-stacks.rst @@ -0,0 +1,153 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Virtually Mapped Kernel Stack Support +===================================== + +:Author: Shuah Khan + +.. contents:: :local: + +Overview +-------- + +This is a compilation of information from the code and original patch +series that introduced the `Virtually Mapped Kernel Stacks feature +` + +Introduction +------------ + +Kernel stack overflows are often hard to debug and make the kernel +susceptible to exploits. Problems could show up at a later time making +it difficult to isolate and root-cause. + +Virtually-mapped kernel stacks with guard pages causes kernel stack +overflows to be caught immediately rather than causing difficult to +diagnose corruptions. + +HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable +support for virtually mapped stacks with guard pages. This feature +causes reliable faults when the stack overflows. The usability of +the stack trace after overflow and response to the overflow itself +is architecture dependent. + +.. note:: + As of this writing, arm64, powerpc, riscv, s390, um, and x86 have + support for VMAP_STACK. + +HAVE_ARCH_VMAP_STACK +-------------------- + +Architectures that can support Virtually Mapped Kernel Stacks should +enable this bool configuration option. The requirements are: + +- vmalloc space must be large enough to hold many kernel stacks. This + may rule out many 32-bit architectures. +- Stacks in vmalloc space need to work reliably. For example, if + vmap page tables are created on demand, either this mechanism + needs to work while the stack points to a virtual address with + unpopulated page tables or arch code (switch_to() and switch_mm(), + most likely) needs to ensure that the stack's page table entries + are populated before running on a possibly unpopulated stack. +- If the stack overflows into a guard page, something reasonable + should happen. The definition of "reasonable" is flexible, but + instantly rebooting without logging anything would be unfriendly. + +VMAP_STACK +---------- + +VMAP_STACK bool configuration option when enabled allocates virtually +mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK. + +- Enable this if you want the use virtually-mapped kernel stacks + with guard pages. This causes kernel stack overflows to be caught + immediately rather than causing difficult-to-diagnose corruption. + +.. note:: + + Using this feature with KASAN requires architecture support + for backing virtual mappings with real shadow memory, and + KASAN_VMALLOC must be enabled. + +.. note:: + + VMAP_STACK is enabled, it is not possible to run DMA on stack + allocated data. + +Kernel configuration options and dependencies keep changing. Refer to +the latest code base: + +`Kconfig ` + +Allocation +----------- + +When a new kernel thread is created, thread stack is allocated from +virtually contiguous memory pages from the page level allocator. These +pages are mapped into contiguous kernel virtual space with PAGE_KERNEL +protections. + +alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack +with PAGE_KERNEL protections. + +- Allocated stacks are cached and later reused by new threads, so memcg + accounting is performed manually on assigning/releasing stacks to tasks. + Hence, __vmalloc_node_range is called without __GFP_ACCOUNT. +- vm_struct is cached to be able to find when thread free is initiated + in interrupt context. free_thread_stack() can be called in interrupt + context. +- On arm64, all VMAP's stacks need to have the same alignment to ensure + that VMAP'd stack overflow detection works correctly. Arch specific + vmap stack allocator takes care of this detail. +- This does not address interrupt stacks - according to the original patch + +Thread stack allocation is initiated from clone(), fork(), vfork(), +kernel_thread() via kernel_clone(). Leaving a few hints for searching +the code base to understand when and how thread stack is allocated. + +Bulk of the code is in: +`kernel/fork.c `. + +stack_vm_area pointer in task_struct keeps track of the virtually allocated +stack and a non-null stack_vm_area pointer serves as a indication that the +virtually mapped kernel stacks are enabled. + +:: + + struct vm_struct *stack_vm_area; + +Stack overflow handling +----------------------- + +Leading and trailing guard pages help detect stack overflows. When stack +overflows into the guard pages, handlers have to be careful not overflow +the stack again. When handlers are called, it is likely that very little +stack space is left. + +On x86, this is done by handling the page fault indicating the kernel +stack overflow on the double-fault stack. + +Testing VMAP allocation with guard pages +---------------------------------------- + +How do we ensure that VMAP_STACK is actually allocating with a leading +and trailing guard page? The following lkdtm tests can help detect any +regressions. + +:: + + void lkdtm_STACK_GUARD_PAGE_LEADING() + void lkdtm_STACK_GUARD_PAGE_TRAILING() + +Conclusions +----------- + +- A percpu cache of vmalloced stacks appears to be a bit faster than a + high-order stack allocation, at least when the cache hits. +- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and + simply embed the thread_info (containing only flags) and 'int cpu' into + task_struct. +- The thread stack can be free'ed as soon as the task is dead (without + waiting for RCU) and then, if vmapped stacks are in use, cache the + entire stack for reuse on the same cpu. diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst new file mode 100644 index 000000000000..c9c495f62d12 --- /dev/null +++ b/Documentation/mm/vmemmap_dedup.rst @@ -0,0 +1,223 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +A vmemmap diet for HugeTLB and Device DAX +========================================= + +HugeTLB +======= + +The struct page structures (page structs) are used to describe a physical +page frame. By default, there is a one-to-one mapping from a page frame to +it's corresponding page struct. + +HugeTLB pages consist of multiple base page size pages and is supported by many +architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more +details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are +currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page +consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. +For each base page, there is a corresponding page struct. + +Within the HugeTLB subsystem, only the first 4 page structs are used to +contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides +this upper limit. The only 'useful' information in the remaining page structs +is the compound_head field, and this field is the same for all tail pages. + +By removing redundant page structs for HugeTLB pages, memory can be returned +to the buddy allocator for other uses. + +Different architectures support different HugeTLB pages. For example, the +following table is the HugeTLB page size supported by x86 and arm64 +architectures. Because arm64 supports 4k, 16k, and 64k base pages and +supports contiguous entries, so it supports many kinds of sizes of HugeTLB +page. + ++--------------+-----------+-----------------------------------------------+ +| Architecture | Page Size | HugeTLB Page Size | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| x86-64 | 4KB | 2MB | 1GB | | | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| | 4KB | 64KB | 2MB | 32MB | 1GB | +| +-----------+-----------+-----------+-----------+-----------+ +| arm64 | 16KB | 2MB | 32MB | 1GB | | +| +-----------+-----------+-----------+-----------+-----------+ +| | 64KB | 2MB | 512MB | 16GB | | ++--------------+-----------+-----------+-----------+-----------+-----------+ + +When the system boot up, every HugeTLB page has more than one struct page +structs which size is (unit: pages):: + + struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + +Where HugeTLB_Size is the size of the HugeTLB page. We know that the size +of the HugeTLB page is always n times PAGE_SIZE. So we can get the following +relationship:: + + HugeTLB_Size = n * PAGE_SIZE + +Then:: + + struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + = n * sizeof(struct page) / PAGE_SIZE + +We can use huge mapping at the pud/pmd level for the HugeTLB page. + +For the HugeTLB page of the pmd level mapping, then:: + + struct_size = n * sizeof(struct page) / PAGE_SIZE + = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE + = sizeof(struct page) / sizeof(pte_t) + = 64 / 8 + = 8 (pages) + +Where n is how many pte entries which one page can contains. So the value of +n is (PAGE_SIZE / sizeof(pte_t)). + +This optimization only supports 64-bit system, so the value of sizeof(pte_t) +is 8. And this optimization also applicable only when the size of struct page +is a power of two. In most cases, the size of struct page is 64 bytes (e.g. +x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the +size of struct page structs of it is 8 page frames which size depends on the +size of the base page. + +For the HugeTLB page of the pud level mapping, then:: + + struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) + = PAGE_SIZE / 8 * 8 (pages) + = PAGE_SIZE (pages) + +Where the struct_size(pmd) is the size of the struct page structs of a +HugeTLB page of the pmd level mapping. + +E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB +HugeTLB page consists in 4096. + +Next, we take the pmd level mapping of the HugeTLB page as an example to +show the internal implementation of this optimization. There are 8 pages +struct page structs associated with a HugeTLB page which is pmd mapped. + +Here is how things look before optimization:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | -------------> | 2 | + | | +-----------+ +-----------+ + | | | 3 | -------------> | 3 | + | | +-----------+ +-----------+ + | | | 4 | -------------> | 4 | + | PMD | +-----------+ +-----------+ + | level | | 5 | -------------> | 5 | + | mapping | +-----------+ +-----------+ + | | | 6 | -------------> | 6 | + | | +-----------+ +-----------+ + | | | 7 | -------------> | 7 | + | | +-----------+ +-----------+ + | | + | | + | | + +-----------+ + +The value of page->compound_head is the same for all tail pages. The first +page of page structs (page 0) associated with the HugeTLB page contains the 4 +page structs necessary to describe the HugeTLB. The only use of the remaining +pages of page structs (page 1 to page 7) is to point to page->compound_head. +Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs +will be used for each HugeTLB page. This will allow us to free the remaining +7 pages to the buddy allocator. + +Here is how things look after remapping:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | | + | | | 2 | -----------------+ | | | | | + | | +-----------+ | | | | | + | | | 3 | -------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | ---------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | -----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | -------------------------+ | + | | +-----------+ | + | | | 7 | ---------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + +When a HugeTLB is freed to the buddy system, we should allocate 7 pages for +vmemmap pages and restore the previous mapping relationship. + +For the HugeTLB page of the pud level mapping. It is similar to the former. +We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. + +Apart from the HugeTLB page of the pmd/pud level mapping, some architectures +(e.g. aarch64) provides a contiguous bit in the translation table entries +that hints to the MMU to indicate that it is one of a contiguous set of +entries that can be cached in a single TLB entry. + +The contiguous bit is used to increase the mapping size at the pmd and pte +(last) level. So this type of HugeTLB page can be optimized only when its +size of the struct page structs is greater than 1 page. + +Notice: The head vmemmap page is not freed to the buddy allocator and all +tail vmemmap pages are mapped to the head vmemmap page frame. So we can see +more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) +associated with each HugeTLB page. The compound_head() can handle this +correctly (more details refer to the comment above compound_head()). + +Device DAX +========== + +The device-dax interface uses the same tail deduplication technique explained +in the previous chapter, except when used with the vmemmap in +the device (altmap). + +The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), +PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). + +The differences with HugeTLB are relatively minor. + +It only use 3 page structs for storing all information as opposed +to 4 on HugeTLB pages. + +There's no remapping of vmemmap given that device-dax memory is not part of +System RAM ranges initialized at boot. Thus the tail page deduplication +happens at a later stage when we populate the sections. HugeTLB reuses the +the head vmemmap page representing, whereas device-dax reuses the tail +vmemmap page. This results in only half of the savings compared to HugeTLB. + +Deduplicated tail pages are not mapped read-only. + +Here's how things look like on device-dax after the sections are populated:: + + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | ----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | ------------------------+ | + | | +-----------+ | + | | | 7 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ diff --git a/Documentation/mm/z3fold.rst b/Documentation/mm/z3fold.rst new file mode 100644 index 000000000000..224e3c61d686 --- /dev/null +++ b/Documentation/mm/z3fold.rst @@ -0,0 +1,30 @@ +.. _z3fold: + +====== +z3fold +====== + +z3fold is a special purpose allocator for storing compressed pages. +It is designed to store up to three compressed pages per physical page. +It is a zbud derivative which allows for higher compression +ratio keeping the simplicity and determinism of its predecessor. + +The main differences between z3fold and zbud are: + +* unlike zbud, z3fold allows for up to PAGE_SIZE allocations +* z3fold can hold up to 3 compressed pages in its page +* z3fold doesn't export any API itself and is thus intended to be used + via the zpool API. + +To keep the determinism and simplicity, z3fold, just like zbud, always +stores an integral number of compressed pages per page, but it can store +up to 3 pages unlike zbud which can store at most 2. Therefore the +compression ratio goes to around 2.7x while zbud's one is around 1.7x. + +Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not +return a dereferenceable pointer. Instead, it returns an unsigned long +handle which encodes actual location of the allocated object. + +Keeping effective compression ratio close to zsmalloc's, z3fold doesn't +depend on MMU enabled and provides more predictable reclaim behavior +which makes it a better fit for small and response-critical systems. diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst new file mode 100644 index 000000000000..6e79893d6132 --- /dev/null +++ b/Documentation/mm/zsmalloc.rst @@ -0,0 +1,82 @@ +.. _zsmalloc: + +======== +zsmalloc +======== + +This allocator is designed for use with zram. Thus, the allocator is +supposed to work well under low memory conditions. In particular, it +never attempts higher order page allocation which is very likely to +fail under memory pressure. On the other hand, if we just use single +(0-order) pages, it would suffer from very high fragmentation -- +any object of size PAGE_SIZE/2 or larger would occupy an entire page. +This was one of the major issues with its predecessor (xvmalloc). + +To overcome these issues, zsmalloc allocates a bunch of 0-order pages +and links them together using various 'struct page' fields. These linked +pages act as a single higher-order page i.e. an object can span 0-order +page boundaries. The code refers to these linked pages as a single entity +called zspage. + +For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE +since this satisfies the requirements of all its current users (in the +worst case, page is incompressible and is thus stored "as-is" i.e. in +uncompressed form). For allocation requests larger than this size, failure +is returned (see zs_malloc). + +Additionally, zs_malloc() does not return a dereferenceable pointer. +Instead, it returns an opaque handle (unsigned long) which encodes actual +location of the allocated object. The reason for this indirection is that +zsmalloc does not keep zspages permanently mapped since that would cause +issues on 32-bit systems where the VA region for kernel space mappings +is very small. So, before using the allocating memory, the object has to +be mapped using zs_map_object() to get a usable pointer and subsequently +unmapped using zs_unmap_object(). + +stat +==== + +With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via +``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: + + # cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + ... + ... + 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + ... + ... + + +class + index +size + object size zspage stores +almost_empty + the number of ZS_ALMOST_EMPTY zspages(see below) +almost_full + the number of ZS_ALMOST_FULL zspages(see below) +obj_allocated + the number of objects allocated +obj_used + the number of objects allocated to the user +pages_used + the number of pages allocated for the class +pages_per_zspage + the number of 0-order pages to make a zspage + +We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where + +* n = number of allocated objects +* N = total number of objects zspage can store +* f = fullness_threshold_frac(ie, 4 at the moment) + +Similarly, we assign zspage to: + +* ZS_ALMOST_FULL when n > N / f +* ZS_EMPTY when n == 0 +* ZS_FULL when n == N diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst index 0c8276109fc0..30c69e1f44fe 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst @@ -13,7 +13,7 @@ 监测数据访问 ============ -:doc:`DAMON ` 允许轻量级的数据访问监测。使用DAMON, +:doc:`DAMON ` 允许轻量级的数据访问监测。使用DAMON, 用户可以分析他们系统的内存访问模式,并优化它们。 .. toctree:: diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst index 1500bdbf338a..c976f3e33ffd 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst @@ -229,4 +229,4 @@ DAMON_RECLAIM再次什么都不做,这样我们就可以退回到基于LRU列 .. [1] https://research.google/pubs/pub48551/ .. [2] https://lwn.net/Articles/787611/ -.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html +.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index eee0e8c5c368..cd41ada4fdad 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -33,9 +33,9 @@ DAMON 为不同的用户提供了下面这些接口。 口相同。这将在下一个LTS内核发布后被移除,所以用户应该转移到 :ref:`sysfs interface `。 - *内核空间编程接口。* - :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 + :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 核空间的DAMON应用程序,最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。 - 详细情况请参考接口 :doc:`文件 `。 + 详细情况请参考接口 :doc:`文件 `。 sysfs接口 ========= @@ -148,7 +148,7 @@ contexts//monitoring_attrs/ 在 ``nr_regions`` 目录下,有两个文件分别用于DAMON监测区域的下限和上限(``min`` 和 ``max`` ), 这两个文件控制着监测的开销。你可以通过向这些文件的写入和读出来设置和获取这些值。 -关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/vm/damon/design`)。 +关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/mm/damon/design`)。 contexts//targets/ --------------------- @@ -318,7 +318,7 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, ---- 用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔`` -以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/vm/damon/design` 。例如, +以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如, 下面的命令将这些值设置为5ms、100ms、1000ms、10和1000,然后再次检查:: # cd /damon diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst index 26d9913fc8b6..b03020c8b2ab 100644 --- a/Documentation/translations/zh_CN/core-api/index.rst +++ b/Documentation/translations/zh_CN/core-api/index.rst @@ -101,7 +101,7 @@ Todolist: ======== 如何在内核中分配和使用内存。请注意,在 -:doc:`/vm/index` 中有更多的内存管理文档。 +:doc:`/mm/index` 中有更多的内存管理文档。 .. toctree:: :maxdepth: 1 diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index ad7bb8c17562..bf85baca8b3e 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -118,7 +118,7 @@ TODOList: sound/index filesystems/index scheduler/index - vm/index + mm/index peci/index TODOList: diff --git a/Documentation/translations/zh_CN/mm/active_mm.rst b/Documentation/translations/zh_CN/mm/active_mm.rst new file mode 100644 index 000000000000..c2816f523bd7 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/active_mm.rst @@ -0,0 +1,85 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/active_mm.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +========= +Active MM +========= + +这是一封linux之父回复开发者的一封邮件,所以翻译时我尽量保持邮件格式的完整。 + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + 因为我并不经常写解释,所以已经抄送到linux-kernel邮件列表,而当我做这些, + 且更多的人在阅读它们时,我觉得棒极了。 + + 1999年7月30日 星期五, David Mosberger 写道: + > + > 是否有一个简短的描述,说明task_struct中的 + > "mm" 和 "active_mm"应该如何使用? (如果 + > 这个问题在邮件列表中讨论过,我表示歉意--我刚 + > 刚度假回来,有一段时间没能关注linux-kernel了)。 + + 基本上,新的设定是: + + - 我们有“真实地址空间”和“匿名地址空间”。区别在于,匿名地址空间根本不关心用 + 户级页表,所以当我们做上下文切换到匿名地址空间时,我们只是让以前的地址空间 + 处于活动状态。 + + 一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线 + 程基本上都属于这一类,但即使是“真正的”线程也可以暂时说在一定时间内它们不 + 会对用户空间感兴趣,调度器不妨试着避免在切换VM状态上浪费时间。目前只有老 + 式的bdflush sync能做到这一点。 + + - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说,tsk->mm将是NULL, + 其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。 + + - 然而,我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此,我们 + 有 “tsk->active_mm”,它显示了当前活动的地址空间是什么。 + + 规则是,对于一个有真实地址空间的进程(即tsk->mm是 non-NULL),active_mm + 显然必须与真实的mm相同。 + + 对于一个匿名进程,tsk->mm == NULL,而tsk->active_mm是匿名进程运行时 + “借用”的mm。当匿名进程被调度走时,借用的地址空间被返回并清除。 + + 为了支持所有这些,“struct mm_struct”现在有两个计数器:一个是 “mm_users” + 计数器,即有多少 “真正的地址空间用户”,另一个是 “mm_count”计数器,即 “lazy” + 用户(即匿名用户)的数量,如果有任何真正的用户,则加1。 + + 通常情况下,至少有一个真正的用户,但也可能是真正的用户在另一个CPU上退出,而 + 一个lazy的用户仍在活动,所以你实际上得到的情况是,你有一个地址空间 **只** + 被lazy的用户使用。这通常是一个短暂的生命周期状态,因为一旦这个线程被安排给一 + 个真正的线程,这个 “僵尸” mm就会被释放,因为 “mm_count”变成了零。 + + 另外,一个新的规则是,**没有人** 再把 “init_mm” 作为一个真正的MM了。 + “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”,事实上,它主 + 要是在启动时使用,当时还没有真正的VM被创建。因此,用来检查的代码 + + if (current->mm == &init_mm) + + 一般来说,应该用 + + if (!current->mm) + + 取代上面的写法(这更有意义--测试基本上是 “我们是否有一个用户环境”,并且通常 + 由缺页异常处理程序和类似的东西来完成)。 + + 总之,我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1,因为它稍微改 + 变了接口以适配alpha(谁会想到呢,但alpha体系结构上下文切换代码实际上最终是 + 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的,alpha的PALcode将两者 + 连接起来,你需要同时切换两者)。 + + (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/translations/zh_CN/mm/balance.rst b/Documentation/translations/zh_CN/mm/balance.rst new file mode 100644 index 000000000000..6fd79209c307 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/balance.rst @@ -0,0 +1,81 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/balance.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +内存平衡 +======== + +2000年1月开始,作者:Kanoj Sarcar + +对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配,需要进行 +内存平衡。 + +调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个 +原因可能是,调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退 +选项的机会主义高阶分配请求中。在这种情况下,调用者可能也希望避免唤醒kswapd。 + +__GFP_IO分配请求是为了防止文件系统死锁。 + +在没有非睡眠分配请求的情况下,做平衡似乎是有害的。页面回收可以被懒散地启动,也就是 +说,只有在需要的时候(也就是区域的空闲内存为0),而不是让它成为一个主动的过程。 + +也就是说,内核应该尝试从直接映射池中满足对直接映射页的请求,而不是回退到dma池中, +这样就可以保持dma池为dma请求(不管是不是原子的)所填充。类似的争论也适用于高内存 +和直接映射的页面。相反,如果有很多空闲的dma页,最好是通过从dma池中分配一个来满足 +常规的内存请求,而不是产生常规区域平衡的开销。 + +在2.2中,只有当空闲页总数低于总内存的1/64时,才会启动内存平衡/页面回收。如果dma +和常规内存的比例合适,即使dma区完全空了,也很可能不会进行平衡。2.2已经在不同内存 +大小的生产机器上运行,即使有这个问题存在,似乎也做得不错。在2.3中,由于HIGHMEM的 +存在,这个问题变得更加严重。 + +在2.3中,区域平衡可以用两种方式之一来完成:根据区域的大小(可能是低级区域的大小), +我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是,在平衡的时 +候,我们不需要看低级区的大小,坏的方面是,我们可能会因为忽略低级区可能较低的使用率 +而做过于频繁的平衡。另外,只要对分配程序稍作修改,就有可能将memclass()宏简化为一 +个简单的等式。 + +另一个可能的解决方案是,我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其 +低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题,并尽可能地保持了与2.2行为 +的接近。另外,平衡算法在各种架构上的工作方式也是一样的,这些架构有不同数量和类型的 +内存区。如果我们想变得更花哨一点,我们可以在未来为不同区域的自由页面分配不同的权重。 + +请注意,如果普通区的大小与dma区相比是巨大的,那么在决定是否平衡普通区的时候,考虑 +空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。 + +所附的补丁实现了第二个解决方案。它还 “修复”了两个问题:首先,在低内存条件下,kswapd +被唤醒,就像2.2中的非睡眠分配。第二,HIGHMEM区也被平衡了,以便给replace_with_highmem() +一个争取获得HIGHMEM页的机会,同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM +页不会被泄露(例如,在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下)。 + +kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的,可能 +是因为所有的分配请求都来自中断上下文,而所有的进程上下文都在睡眠。对于2.3, +kswapd并不真正需要平衡高内存区,因为中断上下文并不请求高内存页。kswapd看zone +结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。 + +如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力,而该区的内存压力 +已经低于其水位,则会进行偷取。 + +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: +这些是每个区的字段,用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时, +hysteric 的字段low_on_memory被设置。这个字段会一直被设置,直到空闲页数变成水位 +[WMARK_HIGH]。当low_on_memory被设置时,页面分配请求将尝试释放该区域的一些页面(如果 +请求中设置了GFP_WAIT)。与此相反的是,决定唤醒kswapd以释放一些区的页。这个决定不是基于 +hysteresis 的,而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行;在这种情况下, +zone_wake_kswapd也被设置。 + + +我所听到的(超棒的)想法: + +1. 动态经历应该影响平衡:可以跟踪一个区的失败请求的数量,并反馈到平衡方案中(jalvo@mbay.net)。 + +2. 实现一个类似于replace_with_highmem()的replace_with_regular(),以保留dma页面。 + (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/translations/zh_CN/mm/damon/api.rst b/Documentation/translations/zh_CN/mm/damon/api.rst new file mode 100644 index 000000000000..5593a83c86bc --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/api.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/api.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======= +API参考 +======= + +内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` , +它位于源代码树的include/linux/。 + +结构体 +====== + +该API在以下内核代码中: + +include/linux/damon.h + + +函数 +==== + +该API在以下内核代码中: + +mm/damon/core.c diff --git a/Documentation/translations/zh_CN/mm/damon/design.rst b/Documentation/translations/zh_CN/mm/damon/design.rst new file mode 100644 index 000000000000..16e3db34a7dd --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/design.rst @@ -0,0 +1,140 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/design.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +==== +设计 +==== + +可配置的层 +========== + +DAMON提供了数据访问监控功能,同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间 +并为之优化的基元。另一方面,作为DAMON的核心,准确性和开销的权衡机制是在纯逻辑空间中。DAMON +将这两部分分离在不同的层中,并定义了它的接口,以允许各种低层次的基元实现与核心逻辑的配置。 + +由于这种分离的设计和可配置的接口,用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的 +任何地址空间。如果没有提供合适的,用户可以自己实现基元。 + +例如,物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。 +另外,如果某些架构或设备支持特殊的优化访问检查基元,这些基元将很容易被配置。 + + +特定地址空间基元的参考实现 +========================== + +基本访问监测的低级基元被定义为两部分。: + +1. 确定地址空间的监测目标地址范围 +2. 目标空间中特定地址范围的访问检查。 + +DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。 + + +基于VMA的目标地址范围构造 +------------------------- + +这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间,只是要求用户手动设置监控目标地址范围。 + +在进程的超级巨大的虚拟地址空间中,只有小部分被映射到物理内存并被访问。因此,跟踪未映射的地 +址区域只是一种浪费。然而,由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声,所以严 +格来说,跟踪每一个映射并不是必须的,但在某些情况下甚至会产生很高的开销。也就是说,监测目标 +内部过于巨大的未映射区域应该被移除,以不占用自适应机制的时间。 + +出于这个原因,这个实现将复杂的映射转换为三个不同的区域,覆盖地址空间的每个映射区域。这三个 +区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面 +的mmap()区域之间的间隙,以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙 +在通常的地址空间中是异常巨大的,排除这些间隙就足以做出合理的权衡。下面详细说明了这一点:: + + + + + (small mmap()-ed regions and munmap()-ed regions) + + + + + +基于PTE访问位的访问检查 +----------------------- + +物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中 +找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表,而物理地址的实现则 +是查找与该地址有映射关系的每一个页表。通过这种方式,实现者找到并清除下一个采样目标地址的位, +并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统,即空闲页跟 +踪和回收逻辑。为了避免这种干扰,DAMON使其与空闲页面跟踪相互排斥,并使用 ``PG_idle`` 和 +``PG_young`` 页面标志来解决与回收逻辑的冲突,就像空闲页面跟踪那样。 + + +独立于地址空间的核心机制 +======================== + +下面四个部分分别描述了DAMON的核心机制和五个监测属性,即 ``采样间隔`` 、 ``聚集间隔`` 、 +``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。 + + +访问频率监测 +------------ + +DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置 +``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说,DAMON检查每个 ``采样间隔`` 对每 +个页面的访问,并将结果汇总。换句话说,计算每个页面的访问次数。在每个 ``聚合间隔`` 过 +去后,DAMON调用先前由用户注册的回调函数,以便用户可以阅读聚合的结果,然后再清除这些结 +果。这可以用以下简单的伪代码来描述:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +这种机制的监测开销将随着目标工作负载规模的增长而任意增加。 + + +基于区域的抽样调查 +------------------ + +为了避免开销的无限制增加,DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持 +这个假设(一个区域内的页面具有相同的访问频率),该区域内就只需要检查一个页面。因此,对 +于每个 ``采样间隔`` ,DAMON在每个区域中随机挑选一个页面,等待一个 ``采样间隔`` ,检 +查该页面是否同时被访问,如果被访问则增加该区域的访问频率。因此,监测开销是可以通过设置 +区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。 + +然而,如果假设没有得到保证,这个方案就不能保持输出的质量。 + + +适应性区域调整 +-------------- + +即使最初的监测目标区域被很好地构建以满足假设(同一区域内的页面具有相似的访问频率),数 +据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设,DAMON根据每个 +区域的访问频率自适应地进行合并和拆分。 + +对于每个 ``聚集区间`` ,它比较相邻区域的访问频率,如果频率差异较小,就合并这些区域。 +然后,在它报告并清除每个区域的聚合接入频率后,如果区域总数不超过用户指定的最大区域数, +它将每个区域拆分为两个或三个区域。 + +通过这种方式,DAMON提供了其最佳的质量和最小的开销,同时保持了用户为其权衡设定的界限。 + + +动态目标空间更新处理 +-------------------- + +监测目标地址范围可以动态改变。例如,虚拟内存可以动态地被映射和解映射。物理内存可以被 +热插拔。 + +由于在某些情况下变化可能相当频繁,DAMON允许监控操作检查动态变化,包括内存映射变化, +并仅在用户指定的时间间隔( ``更新间隔`` )中的每个时间段,将其应用于监控操作相关的 +数据结构,如抽象的监控目标内存区。 \ No newline at end of file diff --git a/Documentation/translations/zh_CN/mm/damon/faq.rst b/Documentation/translations/zh_CN/mm/damon/faq.rst new file mode 100644 index 000000000000..de4be417494a --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/faq.rst @@ -0,0 +1,48 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/faq.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +常见问题 +======== + +为什么是一个新的子系统,而不是扩展perf或其他用户空间工具? +========================================================== + +首先,因为它需要尽可能的轻量级,以便可以在线使用,所以应该避免任何不必要的开销,如内核-用户 +空间的上下文切换成本。第二,DAMON的目标是被包括内核在内的其他程序所使用。因此,对特定工具 +(如perf)的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。 + + +“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗? +============================================== + +闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的,尽管它可以 +使用采样来减少开销。另一方面,DAMON是一个更高层次的框架,用于监控各种地址空间。它专注于内 +存管理优化,并提供复杂的精度/开销处理机制。因此,“空闲页面跟踪” 和 “perf mem” 可以提供 +DAMON输出的一个子集,但不能替代DAMON。 + + +DAMON是否只支持虚拟内存? +========================= + +不,DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始 +部分,包括监测目标区域的构造和实际的访问检查。通过这种方式,DAMON用户可以用任何访问检查技 +术来监测任何地址空间。 + +尽管如此,DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间 +相关功能的实现,以供参考和方便使用。 + + +我可以简单地监测页面的粒度吗? +============================== + +是的,你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。 +因为监视目标区域的大小被强制为 ``>=page size`` ,所以区域分割不会产生任何影响。 diff --git a/Documentation/translations/zh_CN/mm/damon/index.rst b/Documentation/translations/zh_CN/mm/damon/index.rst new file mode 100644 index 000000000000..b03bf307204f --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/index.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/index.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +========================== +DAMON:数据访问监视器 +========================== + +DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为 +(该核心机制详见(Documentation/translations/zh_CN/mm/damon/design.rst)) + + - *准确度* (监测输出对DRAM级别的内存管理足够有用;但可能不适合CPU Cache级别), + - *轻量级* (监控开销低到可以在线应用),以及 + - *可扩展* (无论目标工作负载的大小,开销的上限值都在恒定范围内)。 + +因此,利用这个框架,内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实 +验性内存管理优化工作可以再次进行。同时,在用户空间,有一些特殊工作负载的用户可以编写 +个性化的应用程序,以便更好地了解和优化他们的工作负载和系统。 + +.. toctree:: + :maxdepth: 2 + + faq + design + api diff --git a/Documentation/translations/zh_CN/mm/free_page_reporting.rst b/Documentation/translations/zh_CN/mm/free_page_reporting.rst new file mode 100644 index 000000000000..83b14cce9adf --- /dev/null +++ b/Documentation/translations/zh_CN/mm/free_page_reporting.rst @@ -0,0 +1,38 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/_free_page_reporting.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +========== +空闲页报告 +========== + +空闲页报告是一个API,设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟 +化的情况下是很有用的,客户机能够使用这些数据来通知管理器它不再使用内存中的某些页 +面。 + +对于驱动,通常是气球驱动要使用这个功能,它将分配和初始化一个page_reporting_dev_info +结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必 +须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。 +假设没有其他页面报告设备已经注册, 对page_reporting_register的调用将向报告框 +架注册页面报告接口。 + +一旦注册,页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告 +页面,并在任何足够高的页面被释放之后2秒继续报告。 + +报告的页面将被存储在传递给报告函数的散列表中,最后一个条目的结束位被设置在条目 +nent-1中。 当页面被报告函数处理时,分配器将无法访问它们。一旦报告函数完成,这些 +页将被返回到它们所获得的自由区域。 + +在移除使用空闲页报告的驱动之前,有必要调用page_reporting_unregister,以移除 +目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报 +告通过该接口发出。如果另一个驱动或同一驱动被注册,它就有可能恢复前一个驱动在报告 +空闲页方面的工作。 + + +Alexander Duyck, 2019年12月04日 diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst new file mode 100644 index 000000000000..5c18ea2be04f --- /dev/null +++ b/Documentation/translations/zh_CN/mm/frontswap.rst @@ -0,0 +1,196 @@ +:Original: Documentation/mm/_free_page_reporting.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +========= +Frontswap +========= + +Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由 +于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能 +节省(提高)。 + +.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 +储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory +(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 +求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 +调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops +的功能,它提供的功能必须符合某些策略,如下所示: + +一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap +交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 +量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 +内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 +transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 +相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 + +一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 +要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 +经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, +也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 +可以像往常一样被写入交换空间。 + +请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” +的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 +不会从frontswap中获得。 + +如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 +debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): + +``failed_stores`` + 有多少次存储的尝试是失败的 + +``loads`` + 尝试了多少次加载(应该全部成功) + +``succ_stores`` + 有多少次存储的尝试是成功的 + +``invalidates`` + 尝试了多少次作废 + +后台实现可以提供额外的指标。 + +经常问到的问题 +============== + +* 价值在哪里? + +当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 +读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 +能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 +移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 +页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 + +Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 +灵活性: + +在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 +全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 +用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 +工作负载上,则有明显的性能改善(25%以上)。 + +“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory +的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 +统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 +交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 +可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 +有多少内存可用 + +在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 +用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 +是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory +后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, +而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 +的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap +允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), +从而减轻计划外交换可能带来的可怕的性能影响。 + +一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 +内存扩展技术的调查也在进行中。 + +* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? + +如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 +个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 +frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 +是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 +后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 +忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 +的,无论如何使用一小部分的 CPU 都是不相关的。 + +至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 +每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 +页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 +会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 +非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 + +当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 +产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 +来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 + +* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? + +我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 +明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 +供了多少内存是完全动态和随机的。 + +每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 +型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 +换页的尝试。 + +每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 +frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 +间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap +backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 +页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 +据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 +下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 +页面偏移量,否则它就会将数据写入该设备。 + +当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), +检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 +的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, +以便从真正的交换设备上获得这一页的数据。 + +所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 +入都被 “frontswap backend store” 和(可能)“frontswap backend loads” +所取代,这可能会快得多。 + +* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 + 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? + +首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 +结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 +假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 +及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 + +例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend +的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 +先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 +以根据当前的内存限制动态地定义。 + +此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 +块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 +不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 +态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 +复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 +的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” +线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, +KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 + +在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 +例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 +自己的交换。 + +transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 +能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, +frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 +容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 +swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap +就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 +backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 + + +* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, + 难道它不能总是被成功地覆盖吗? + +几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 +缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 +backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap +拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 +换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 + +* 为什么frontswap补丁会创建新的头文件swapfile.h? + +frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 +在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 +个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 + +Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/mm/highmem.rst b/Documentation/translations/zh_CN/mm/highmem.rst new file mode 100644 index 000000000000..81202c65e000 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/highmem.rst @@ -0,0 +1,128 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/highmem.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +========== +高内存处理 +========== + +作者: Peter Zijlstra + +.. contents:: :local: + +高内存是什么? +============== + +当物理内存的大小接近或超过虚拟内存的最大大小时,就会使用高内存(highmem)。在这一点上,内 +核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内 +存的临时映射。 + +没有被永久映射覆盖的那部分(物理)内存就是我们所说的 "高内存"。对于这个边界的确切位置,有 +各种架构上的限制。 + +例如,在i386架构中,我们选择将内核映射到每个进程的虚拟空间,这样我们就不必为内核的进入/退 +出付出全部的TLB作废代价。这意味着可用的虚拟内存空间(i386上为4GiB)必须在用户和内核空间之 +间进行划分。 + +使用这种方法的架构的传统分配方式是3:1,3GiB用于用户空间,顶部的1GiB用于内核空间。:: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +这意味着内核在任何时候最多可以映射1GiB的物理内存,但是由于我们需要虚拟地址空间来做其他事 +情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少(通常在~896MiB左右)。 + +其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而,一些硬件(如一些ARM)在使 +用mm上下文标签时,其虚拟空间有限。 + + +临时虚拟映射 +============ + +内核包含几种创建临时映射的方法。: + +* vmap(). 这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization + 来解除映射。 + +* kmap(). 这允许对单个页面进行短期映射。它需要synchronization,但在一定程度上被摊销。 + 当以嵌套方式使用时,它也很容易出现死锁,因此不建议在新代码中使用它。 + +* kmap_atomic(). 这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上, + 它表现得很好,但发布任务因此被要求留在该CPU上直到它完成,以免其他任务取代它的映射。 + + kmap_atomic() 也可以由中断上下文使用,因为它不睡眠,而且调用者可能在调用kunmap_atomic() + 之后才睡眠。 + + 可以假设k[un]map_atomic()不会失败。 + + +使用kmap_atomic +=============== + +何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存(见__GFP_HIGHMEM) +分配的页面的内容时,例如在页缓存中的页面,就会使用它。该API有两个函数,它们的使用方式与 +下面类似:: + + /* 找到感兴趣的页面。 */ + struct page *page = find_get_page(mapping, offset); + + /* 获得对该页内容的访问权。 */ + void *vaddr = kmap_atomic(page); + + /* 对该页的内容做一些处理。 */ + memset(vaddr, 0, PAGE_SIZE); + + /* 解除该页面的映射。 */ + kunmap_atomic(vaddr); + +注意,kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。 + +如果你需要映射两个页面,因为你想从一个页面复制到另一个页面,你需要保持kmap_atomic调用严 +格嵌套,如:: + + vaddr1 = kmap_atomic(page1); + vaddr2 = kmap_atomic(page2); + + memcpy(vaddr1, vaddr2, PAGE_SIZE); + + kunmap_atomic(vaddr2); + kunmap_atomic(vaddr1); + + +临时映射的成本 +============== + +创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。 + +如果CONFIG_HIGHMEM没有被设置,那么内核会尝试用一点计算来创建映射,将页面结构地址转换成 +指向页面内容的指针,而不是去捣鼓映射。在这种情况下,解映射操作可能是一个空操作。 + +如果CONFIG_MMU没有被设置,那么就不可能有临时映射和高内存。在这种情况下,也将使用计算方法。 + + +i386 PAE +======== + +在某些情况下,i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果: + +* Linux需要为系统中的每个页面建立一个页帧结构,而且页帧需要驻在永久映射中,这意味着: + +* 你最多可以有896M/sizeof(struct page)页帧;由于页结构体是32字节的,所以最终会有 + 112G的页;然而,内核需要在内存中存储更多的页帧...... + +* PAE使你的页表变大--这使系统变慢,因为更多的数据需要在TLB填充等方面被访问。一个好处 + 是,PAE有更多的PTE位,可以提供像NX和PAT这样的高级功能。 + +一般的建议是,你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作 +量有用,但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。 diff --git a/Documentation/translations/zh_CN/mm/hmm.rst b/Documentation/translations/zh_CN/mm/hmm.rst new file mode 100644 index 000000000000..5024a8a15516 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hmm.rst @@ -0,0 +1,361 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/hmm.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +================== +异构内存管理 (HMM) +================== + +提供基础设施和帮助程序以将非常规内存(设备内存,如板上 GPU 内存)集成到常规内核路径中,其 +基石是此类内存的专用struct page(请参阅本文档的第 5 至 7 节)。 + +HMM 还为 SVM(共享虚拟内存)提供了可选的帮助程序,即允许设备透明地访问与 CPU 一致的程序 +地址,这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得 +必不可少,其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。 + +本文档分为以下部分:在第一部分中,我揭示了与使用特定于设备的内存分配器相关的问题。在第二 +部分中,我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页 +表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后, +最后一节介绍了一个新的迁移助手,它允许利用设备 DMA 引擎。 + +.. contents:: :local: + +使用特定于设备的内存分配器的问题 +================================ + +具有大量板载内存(几 GB)的设备(如 GPU)历来通过专用驱动程序特定 API 管理其内存。这会 +造成设备驱动程序分配和管理的内存与常规应用程序内存(私有匿名、共享内存或常规文件支持内存) +之间的隔断。从这里开始,我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况: +即,设备可以透明地使用任何应用程序内存区域。 + +分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来 +看,程序中的所有内存对象并不平等,这使得依赖于广泛的库的大型程序变得复杂。 + +具体来说,这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存(malloc、mmap +私有、mmap 共享)和通过设备驱动程序 API 分配的内存之间复制对象(这仍然以 mmap 结束, +但是是设备文件)。 + +对于平面数据集(数组、网格、图像……),这并不难实现,但对于复杂数据集(列表、树……), +很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错, +而且由于数据集和地址的重复,程序更难调试。 + +分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据,因此每个库可能 +不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响,并因为各种内存 +拷贝而浪费资源。 + +复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出,并不是一个可行的选择。 +这将导致库入口点的组合爆炸。 + +最后,随着高级语言结构(在 C++ 中,当然也在其他语言中)的进步,编译器现在有可能在没有程 +序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有 +其他模式,使用共享地址空间也更合理。 + + +I/O 总线、设备内存特性 +====================== + +由于一些限制,I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本 +内存访问;甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下,它 +不是缓存一致的。 + +如果我们只考虑 PCIE 总线,那么设备可以访问主内存(通常通过 IOMMU)并与 CPU 缓存一 +致。但是,它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟:CPU +只能访问有限范围的设备内存,而不能对其执行原子操作。因此,从内核的角度来看,设备内存不 +能被视为与常规内存等同。 + +另一个严重的因素是带宽有限(约 32GBytes/s,PCIE 4.0 和 16 通道)。这比最快的 GPU +内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自 +己的内存时高一个数量级。 + +一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制 +(OpenCAPI、CCIX)。它们主要允许 CPU 和设备之间的双向缓存一致性,并允许架构支持的所 +有原子操作。遗憾的是,并非所有平台都遵循这一趋势,并且一些主要架构没有针对这些问题的硬 +件解决方案。 + +因此,为了使共享地址空间有意义,我们不仅必须允许设备访问任何内存,而且还必须允许任何内 +存在设备使用时迁移到设备内存(在迁移时阻止 CPU 访问)。 + + +共享地址空间和迁移 +================== + +HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间,因此对 +于进程地址空间中的任何有效主内存地址,相同的地址指向相同的物理内存。 + +为了实现这一点,HMM 提供了一组帮助程序来填充设备页表,同时跟踪 CPU 页表更新。设备页表 +更新不像 CPU 页表更新那么容易。要更新设备页表,您必须分配一个缓冲区(或使用预先分配的 +缓冲区池)并在其中写入 GPU 特定命令以执行更新(取消映射、缓存失效和刷新等)。这不能通 +过所有设备的通用代码来完成。因此,为什么HMM提供了帮助器,在把硬件的具体细节留给设备驱 +动程序的同时,把一切可以考虑的因素都考虑进去了。 + +HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存,它允许为设备内存的每个页面分配一个 +struct page。这些页面很特殊,因为 CPU 无法映射它们。然而,它们允许使用现有的迁移机 +制将主内存迁移到设备内存,从 CPU 的角度来看,一切看起来都像是换出到磁盘的页面。使用 +struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次,HMM 仅提供帮助程序, +首先为设备内存热插拔新的 ZONE_DEVICE 内存,然后执行迁移。迁移内容和时间的策略决定留 +给设备驱动程序。 + +请注意,任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如,当支持给定CPU +地址 A 的页面从主内存页面迁移到设备页面时,对地址 A 的任何 CPU 访问都会触发缺页异常 +并启动向主内存的迁移。 + +凭借这两个特性,HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步,而且还通 +过迁移设备正在使用的数据集部分来利用设备内存。 + + +地址空间镜像实现和API +===================== + +地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中;HMM 有助于 +保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier +开始:: + + int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops); + +在 ops->invalidate() 回调期间,设备驱动程序必须对范围执行更新操作(将范围标记为只 +读,或完全取消映射等)。设备必须在驱动程序回调返回之前完成更新。 + +当设备驱动程序想要填充一个虚拟地址范围时,它可以使用:: + + int hmm_range_fault(struct hmm_range *range); + +如果请求写访问,它将在丢失或只读条目上触发缺页异常(见下文)。缺页异常使用通用的 mm 缺 +页异常代码路径,就像 CPU 缺页异常一样。 + +这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟 +范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。 + +在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面,以保 +持事物正确同步。使用模式是:: + + int driver_populate_range(...) + { + struct hmm_range range; + ... + + range.notifier = &interval_sub; + range.start = ...; + range.end = ...; + range.hmm_pfns = ...; + + if (!mmget_not_zero(interval_sub->notifier.mm)) + return -EFAULT; + + again: + range.notifier_seq = mmu_interval_read_begin(&interval_sub); + mmap_read_lock(mm); + ret = hmm_range_fault(&range); + if (ret) { + mmap_read_unlock(mm); + if (ret == -EBUSY) + goto again; + return ret; + } + mmap_read_unlock(mm); + + take_lock(driver->update); + if (mmu_interval_read_retry(&ni, range.notifier_seq) { + release_lock(driver->update); + goto again; + } + + /* Use pfns array content to update device page table, + * under the update lock */ + + release_lock(driver->update); + return 0; + } + +driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用 +mmu_interval_read_retry() 之前保持,以避免与并发 CPU 页表更新发生任何竞争。 + +利用 default_flags 和 pfn_flags_mask +==================================== + +hmm_range 结构有 2 个字段,default_flags 和 pfn_flags_mask,它们指定整个范围 +的故障或快照策略,而不必为 pfns 数组中的每个条目设置它们。 + +例如,如果设备驱动程序需要至少具有读取权限的范围的页面,它会设置:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = 0; + +并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。 + +现在假设驱动程序想要做同样的事情,除了它想要拥有写权限的范围内的一页。现在驱动程序设 +置:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = HMM_PFN_REQ_WRITE; + range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; + +有了这个,HMM 将在至少读取(即有效)的所有页面中异常,并且对于地址 +== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限,即,如果 +CPU pte 没有设置写权限,那么HMM将调用handle_mm_fault()。 + +hmm_range_fault 完成后,标志位被设置为页表的当前状态,即 HMM_PFN_VALID | 如果页 +面可写,将设置 HMM_PFN_WRITE。 + + +从核心内核的角度表示和管理设备内存 +================================== + +尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存 +的信息,HMM 将自身挂接到 mm 代码的各个位置,以处理对设备内存支持的地址的任何访问。 +事实证明,这最终复制了 struct page 的大部分字段,并且还需要更新许多内核代码路径才 +能理解这种新的内存类型。 + +大多数内核代码路径从不尝试访问页面后面的内存,而只关心struct page的内容。正因为如此, +HMM 切换到直接使用 struct page 用于设备内存,这使得大多数内核代码路径不知道差异。 +我们只需要确保没有人试图从 CPU 端映射这些页面。 + +移入和移出设备内存 +================== + +由于 CPU 无法直接访问设备内存,因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存 +储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和 +migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。 + +在将页面迁移到设备私有内存之前,需要创建特殊的设备私有 ``struct page`` 。这些将用 +作特殊的“交换”页表条目,以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。 + +这些可以通过以下方式分配和释放:: + + struct resource *res; + struct dev_pagemap pagemap; + + res = request_free_mem_region(&iomem_resource, /* number of bytes */, + "name of driver resource"); + pagemap.type = MEMORY_DEVICE_PRIVATE; + pagemap.range.start = res->start; + pagemap.range.end = res->end; + pagemap.nr_range = 1; + pagemap.ops = &device_devmem_ops; + memremap_pages(&pagemap, numa_node_id()); + + memunmap_pages(&pagemap); + release_mem_region(pagemap.range.start, range_len(&pagemap.range)); + +还有devm_request_free_mem_region(), devm_memremap_pages(), +devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``. + +整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration `) , +但这些步骤分为设备驱动程序特定代码和共享公共代码: + +1. ``mmap_read_lock()`` + + 设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup(), + 因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。 + +2. ``migrate_vma_setup(struct migrate_vma *args)`` + + 设备驱动初始化了 ``struct migrate_vma`` 的字段,并将该指针传递给 + migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。 + 例如,设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存,设置 + ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页 + 面。如果后者被设置, ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备 + 私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前,只有匿名的私有VMA + 范围可以被迁移到系统内存和设备私有内存。 + + migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()`` + 和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表,使 + 其他设备的MMU无效,以便在 ``args->src`` 数组中填写要迁移的PFN。 + ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` , + 其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE, ``owner`` 字段设置为传递给 + migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无 + 效化回调,只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。 + + + 在遍历页表时,一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效 + 的 “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清 + 除它,而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被 + ``lock_page()``锁定,与LRU隔离(如果系统内存和设备私有页不在LRU上),从进 + 程中取消映射,并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup() + 还清除了 ``args->dst`` 数组。 + +3. 设备驱动程序分配目标页面并将源页面复制到目标页面。 + + 驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已 + 设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选 + 择跳过页面迁移。 + + 然后,驱动程序分配一个设备私有 struct page 或一个系统内存页,用 ``lock_page()`` + 锁定该页,并将 ``dst`` 数组条目填入:: + + dst[i] = migrate_pfn(page_to_pfn(dpage)); + + 现在驱动程序知道这个页面正在被迁移,它可以使设备私有 MMU 映射无效并将设备私有 + 内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失 + 效,因此设备驱动程序只需使其自己的 MMU 映射失效。 + + 驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的 + ``struct page`` 面,并将源页面复制到目标设备上,如果指针为 ``NULL`` ,意 + 味着源页面没有被填充到系统内存中,则清除目标设备的私有内存。 + +4. ``migrate_vma_pages()`` + + 这一步是实际“提交”迁移的地方。 + + 如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页,这时新分配的页会被插 + 入到CPU的页表中。如果一个CPU线程在同一页面上发生异常,这可能会失败。然而,页 + 表被锁定,只有一个新页会被插入。如果它失去了竞争,设备驱动将看到 + ``MIGRATE_PFN_MIGRATE`` 位被清除。 + + 如果源页被锁定、隔离等,源 ``struct page`` 信息现在被复制到目标 + ``struct page`` ,最终完成CPU端的迁移。 + +5. 设备驱动为仍在迁移的页面更新设备MMU页表,回滚未迁移的页面。 + + 如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置,设备驱动可以 + 更新设备MMU,如果 ``MIGRATE_PFN_WRITE`` 位被设置,则设置写启用位。 + +6. ``migrate_vma_finalize()`` + + 这一步用新页的页表项替换特殊的迁移页表项,并释放对源和目的 ``struct page`` + 的引用。 + +7. ``mmap_read_unlock()`` + + 现在可以释放锁了。 + +独占访问存储器 +============== + +一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一 +个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU +的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一 +个内存范围不能从用户空间访问。 + +这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会 +导致一个异常,该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已 +经被MMU通知器改变,之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序 +放弃页面锁和页面引用为止,这时页面上的任何CPU异常都可以按所述进行。 + +内存 cgroup (memcg) 和 rss 统计 +=============================== + +目前,设备内存被视为 rss 计数器中的任何常规页面(如果设备页面用于匿名,则为匿名, +如果设备页面用于文件支持页面,则为文件,如果设备页面用于共享内存,则为 shmem)。 +这是为了保持现有应用程序的故意选择,这些应用程序可能在不知情的情况下开始使用设备 +内存,运行不受影响。 + +一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序, +因此不会释放太多系统内存。在决定以不同方式计算设备内存之前,我们希望收集更多关 +于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。 + +对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算,常规 +页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规 +内存不会失败,因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对 +内存资源控制的影响有了更多的了解,我们可能会在后面重新考虑这个选择。 + +请注意,设备内存永远不能由设备驱动程序或通过 GUP 固定,因此此类内存在进程退出时 +总是被释放的。或者在共享内存或文件支持内存的情况下,当删除最后一个引用时。 diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst new file mode 100644 index 000000000000..752e5696cd47 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -0,0 +1,436 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/hugetlbfs_reserv.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +============== +Hugetlbfs 预留 +============== + +概述 +==== + +:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指 +示要使用巨页,这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常 +时没有巨页存在,任务就会被发送一个SIGBUS,并经常不高兴地死去。在加入巨页支 +持后不久,人们决定,在mmap()时检测巨页的短缺情况会更好。这个想法是,如果 +没有足够的巨页来覆盖映射,mmap()将失败。这首先是在mmap()时在代码中做一个 +简单的检查,以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一 +样,代码随着时间的推移而不断发展。然而,基本的想法是在mmap()时 “预留” +巨页,以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核 +中是如何进行巨页预留处理的。 + + +读者 +==== +这个描述主要是针对正在修改hugetlbfs代码的内核开发者。 + + +数据结构 +======== + +resv_huge_pages + 这是一个全局的(per-hstate)预留的巨页的计数。预留的巨页只对预留它们的任 + 务可用。因此,一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。 +Reserve Map + 预留映射由以下结构体描述:: + + struct resv_map { + struct kref refs; + spinlock_t lock; + struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; + }; + + 系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的 + 区域。一个区域被描述为:: + + struct file_region { + struct list_head link; + long from; + long to; + }; + + file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型,在 + reserv_map 中的一个区域可能表示该范围存在预留,或预留不存在。 +Flags for MAP_PRIVATE Reservations + 这些被存储在预留的映射指针的底部。 + + ``#define HPAGE_RESV_OWNER (1UL << 0)`` + 表示该任务是与该映射相关的预留的所有者。 + ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` + 表示最初映射此范围(并创建储备)的任务由于COW失败而从该任务(子任务)中取消映 + 射了一个页面。 +Page Flags + PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在 + “释放巨页” 一节中讨论。 + + +预留映射位置(私有或共享) +========================== + +一个巨页映射或段要么是私有的,要么是共享的。如果是私有的,它通常只对一个地址空间 +(任务)可用。如果是共享的,它可以被映射到多个地址空间(任务)。对于这两种类型的映射, +预留映射的位置和语义是明显不同的。位置的差异是: + +- 对于私有映射,预留映射挂在VMA结构体上。具体来说,就是vma->vm_private_data。这个保 + 留映射是在创建映射(mmap(MAP_PRIVATE))时创建的。 +- 对于共享映射,预留映射挂在inode上。具体来说,就是inode->i_mapping->private_data。 + 由于共享映射总是由hugetlbfs文件系统中的文件支持,hugetlbfs代码确保每个节点包含一个预 + 留映射。因此,预留映射在创建节点时被分配。 + + +创建预留 +======== +当创建一个巨大的有页面支持的共享内存段(shmget(SHM_HUGETLB))或通过mmap(MAP_HUGETLB) +创建一个映射时,就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用:: + + int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) + +hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE +标志。如果指定了NORESERVE,那么这个函数立即返回,因为不需要预留。 + +参数'from'和'to'是映射或基础文件的巨页索引。对于shmget(),'from'总是0,'to'对应于段/映射 +的长度。对于mmap(),offset参数可以用来指定进入底层文件的偏移量。在这种情况下,'from'和'to' +参数已经被这个偏移量所调整。 + +PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。 + +- 对于共享映射,预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时,预留映射不被 + 修改。 +- 对于私有映射,预留映射中没有条目表示相应页面存在预留。随着预留被消耗,条目被添加到预留映射中。 + 因此,预留映射也可用于确定哪些预留已被消耗。 + +对于私有映射,hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外, +HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 + +预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射,这始终是一个值(to - from)。 +然而,对于共享映射来说,一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节, +请参见 :ref:`预留映射的修改 ` 一节。 + +该映射可能与一个子池(subpool)相关联。如果是这样,将查询子池以确保有足够的空间用于映射。子池 +有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 ` +一节。 + +在咨询了预留映射和子池之后,就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查 +并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而,在这 +些函数中,代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话,全局预留计数resv_huge_pages +会被调整,如下所示:: + + if (resv_needed <= (resv_huge_pages - free_huge_pages)) + resv_huge_pages += resv_needed; + +注意,在检查和调整这些计数器时,全局锁hugetlb_lock会被预留。 + +如果有足够的空闲的巨页,并且全局计数resv_huge_pages被调整,那么与映射相关的预留映射被修改以 +反映预留。在共享映射的情况下,将存在一个file_region,包括'from'-'to'范围。对于私有映射, +不对预留映射进行修改,因为没有条目表示存在预留。 + +如果hugetlb_reserve_pages()成功,全局预留数和与映射相关的预留映射将根据需要被修改,以确保 +在'from'-'to'范围内存在预留。 + +消耗预留/分配一个巨页 +=========================== + +当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() +中进行的:: + + struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) + +alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 +此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 +预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 +外拷贝被分配。 + + +调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详 +细内容,请参见 :ref:`预留映射帮助函数 ` 一节。从 +vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留,则为0,如果不存在预留,则为1。 +如果不存在预留,并且有一个与映射相关联的子池,则查询子池以确定它是否包含预留。如果子池包含预留, +则可将其中一个用于该分配。然而,在任何情况下,avoid_reserve参数都会优先考虑为分配使用预留。在 +确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 +的参数: + +- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 +- chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, + 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 + 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 + +与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面,当该页面从空闲列表中移除时, +free_huge_pages的值被递减。如果有一个与该页相关的预留,将进行以下调整:: + + SetPagePrivate(page); /* 表示分配这个页面消耗了一个预留, + * 如果遇到错误,以至于必须释放这个页面,预留将被 + * 恢复。 */ + resv_huge_pages--; /* 减少全局预留计数 */ + +注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 +的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: +SetPagePrivate(page) 和 resv_huge_pages--. + +在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +面被释放时,这将被用于子池的计数。 + +然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 +到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 +已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建一 +个新的条目。 + +注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 +的剩余巨页和过度分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整。 +SetPagePrivate(page)和resv_huge_pages-。 + +在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +面被释放时,这将被用于子池的计数。 + +然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 +到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 +已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 +一个新的条目。 + +在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 +vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 +享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 +这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 +识别。如果检测到这种竞争,子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息,请 +参见 :ref:`预留映射帮助函数 ` 一节。 + + +实例化巨页 +========== + +在巨页分配之后,页面通常被添加到分配任务的页表中。在此之前,共享映射中的页面被添加到页面缓 +存中,私有映射中的页面被添加到匿名反向映射中。在这两种情况下,PagePrivate标志被清除。因此, +当一个已经实例化的巨页被释放时,不会对全局预留计数(resv_huge_pages)进行调整。 + + +释放巨页 +======== + +巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 +递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保 +留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。 + +page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置,它表明全局预留计数 +应该被调整(关于如何设置这些标志的信息,请参见 +:ref: `消耗预留/分配一个巨页 ` )。 + + +该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值(不等于 +传递的1的值),它表明预留与子池相关联,这个新释放的页面必须被用来保持子池预留的数量超过最小值。 +因此,在这种情况下,全局resv_huge_pages计数器被递增。 + +如果页面中设置了PagePrivate标志,那么全局resv_huge_pages计数器将永远被递增。 + +子池预留 +======== + +有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一 +个hstate中的页面子集,它与一个已挂载的hugetlbfs文件系统相关 + +当一个hugetlbfs文件系统被挂载时,可以指定min_size选项,它表示文件系统所需的最小的巨页数量。 +如果指定了这个选项,与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体 +hugepage_subpool的min_hpages字段中被跟踪。在挂载时,hugetlb_acct_memory(min_hpages) +被调用以预留指定数量的巨页。如果它们不能被预留,挂载就会失败。 + +当从子池中获取或释放页面时,会调用hugepage_subpool_get/put_pages()函数。 +hugepage_subpool_get/put_pages被传递给巨页数量,以此来调整子池的 “已用页面” 计数 +(get为下降,put为上升)。通常情况下,如果子池中没有足够的页面,它们会返回与传递的相同的值或 +一个错误。 + +然而,如果预留与子池相关联,可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局 +池调整的数量。例如,假设一个子池包含3个预留的巨页,有人要求5个。与子池相关的3个预留页可以用来 +满足部分请求。但是,必须从全局池中获得2个页面。为了向调用者转达这一信息,将返回值2。然后,调用 +者要负责从全局池中获取另外两个页面。 + + +COW和预留 +========== + +由于共享映射都指向并使用相同的底层页面,COW最大的预留问题是私有映射。在这种情况下,两个任务可 +以指向同一个先前分配的页面。一个任务试图写到该页,所以必须分配一个新的页,以便每个任务都指向它 +自己的页。 + +当该页最初被分配时,该页的预留被消耗了。当由于COW而试图分配一个新的页面时,有可能没有空闲的巨 +页,分配会失败。 + +当最初创建私有映射时,通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。 +由于所有者创建了映射,所有者拥有与映射相关的所有预留。因此,当一个写异常发生并且没有可用的页面 +时,对预留的所有者和非所有者采取不同的行动。 + +在发生异常的任务不是所有者的情况下,异常将失败,该任务通常会收到一个SIGBUS。 + +如果所有者是发生异常的任务,我们希望它能够成功,因为它拥有原始的预留。为了达到这个目的,该页被 +从非所有者任务中解映射出来。这样一来,唯一的引用就是来自拥有者的任务。此外,HPAGE_RESV_UNMAPPED +位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常,它可能 +会收到一个SIGBUS。但是,映射/预留的原始拥有者的行为将与预期一致。 + +预留映射的修改 +============== + +以下低级函数用于对预留映射进行修改。通常情况下,这些函数不会被直接调用。而是调用一个预留映射辅 +助函数,该函数调用这些低级函数中的一个。这些低级函数在源代码(mm/hugetlb.c)中得到了相当好的 +记录。这些函数是:: + + long region_chg(struct resv_map *resv, long f, long t); + long region_add(struct resv_map *resv, long f, long t); + void region_abort(struct resv_map *resv, long f, long t); + long region_count(struct resv_map *resv, long f, long t); + +在预留映射上的操作通常涉及两个操作: + +1) region_chg()被调用来检查预留映射,并确定在指定的范围[f, t]内有多少页目前没有被代表。 + + 调用代码执行全局检查和分配,以确定是否有足够的巨页使操作成功。 + +2) + a) 如果操作能够成功,regi_add()将被调用,以实际修改先前传递给regi_chg()的相同范围 + [f, t]的预留映射。 + b) 如果操作不能成功,region_abort被调用,在相同的范围[f, t]内中止操作。 + +注意,这是一个两步的过程, region_add()和 region_abort()在事先调用 region_chg()后保证 +成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作(特别是 region_add())的 +成功。 + +如上所述,region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加 +到映射中的范围内的页数。在大多数情况下, region_add() 的返回值与 region_chg() 的返回值相 +同。然而,在共享映射的情况下,有可能在调用 region_chg() 和 region_add() 之间对预留映射进 +行更改。在这种情况下,regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下,全局计数 +和子池计数很可能是不正确的,需要调整。检查这种情况并进行适当的调整是调用者的责任。 + +函数region_del()被调用以从预留映射中移除区域。 +它通常在以下情况下被调用: + +- 当hugetlbfs文件系统中的一个文件被删除时,该节点将被释放,预留映射也被释放。在释放预留映射 + 之前,所有单独的file_region结构体必须被释放。在这种情况下,region_del的范围是[0, LONG_MAX]。 +- 当一个hugetlbfs文件正在被截断时。在这种情况下,所有在新文件大小之后分配的页面必须被释放。 + 此外,预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下,region_del + 的范围是[new_end_of_file, LONG_MAX]。 +- 当在一个hugetlbfs文件中打洞时。在这种情况下,巨页被一次次从文件的中间移除。当这些页被移除 + 时,region_del()被调用以从预留映射中移除相应的条目。在这种情况下,region_del被传递的范 + 围是[page_idx, page_idx + 1]。 + +在任何情况下,region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下,region_del() +会失败。这只能发生在打洞的情况下,即它必须分割一个现有的file_region条目,而不能分配一个新的 +结构体。在这种错误情况下,region_del()将返回-ENOMEM。这里的问题是,预留映射将显示对该页有 +预留。然而,子池和全局预留计数将不反映该预留。为了处理这种情况,调用函数hugetlb_fix_reserve_counts() +来调整计数器,使其与不能被删除的预留映射条目相对应。 + +region_count()在解除私有巨页映射时被调用。在私有映射中,预留映射中没有条目表明存在一个预留。 +因此,通过计算预留映射中的条目数,我们知道有多少预留被消耗了,有多少预留是未完成的 +(Outstanding = (end - start) - region_count(resv, start, end))。由于映射正在消 +失,子池和全局预留计数被未完成的预留数量所减去。 + +预留映射帮助函数 +================ + +有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣,所以它们只是传入一个 +地址而不是一个范围。此外,它们还传入相关的VMA。从VMA中,可以确定映射的类型(私有或共享)和预留 +映射的位置(inode或VMA)。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而, +它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义,并向调用者隐藏了这个细节:: + + long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +该函数为指定的页面调用 region_chg()。如果不存在预留,则返回1。如果存在预留,则返回0:: + + long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这将调用 region_add(),用于指定的页面。与region_chg和region_add的情况一样,该函数应在 +先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加,它将 +返回1,如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出 +现意外的差异,说明在两次调用之间修改了预留映射:: + + void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样,该函数应在 +先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作:: + + long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这是一个特殊的包装函数,有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数 +中调用。该函数与vma_needs_reservation一起使用,试图将一个预留添加到预留映射中。它考虑到 +了私有和共享映射的不同预留映射语义。因此,region_add被调用用于共享映射(因为映射中的条目表 +示预留),而region_del被调用用于私有映射(因为映射中没有条目表示预留)。关于在错误路径上需 +要做什么的更多信息,请参见 “错误路径中的预留清理” 。 + + +错误路径中的预留清理 +==================== + +正如在:ref:`预留映射帮助函数` 一节中提到的,预留的修改分两步进行。首 +先,在分配页面之前调用vma_needs_reservation。如果分配成功,则调用vma_commit_reservation。 +如果不是,则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整, +一切都很好。 + +此外,在一个巨页被实例化后,PagePrivate标志被清空,这样,当页面最终被释放时,计数是 +正确的。 + +然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, +页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 +(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 +显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, +并阻止分配一个预先分配的页面。 + +函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的 +是将预留映射恢复到页面分配前的状态。通过这种方式,预留映射的状态将与页面释放后的全局预留计 +数相对应。 + +函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下, +它将简单地清除该页的PagePrivate标志。这样一来,当页面被释放时,全局预留计数将不会被递增。 +然而,预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址,但它不会像最 +初设想的那样使用一个预留页。 + +有一些代码(最明显的是userfaultfd)不能调用restore_reserve_on_error。在这种情况下, +它简单地修改了PagePrivate,以便在释放巨页时不会泄露预留。 + + +预留和内存策略 +============== +当git第一次被用来管理Linux代码时,每个节点的巨页列表就存在于hstate结构中。预留的概念是 +在一段时间后加入的。当预留被添加时,没有尝试将内存策略考虑在内。虽然cpusets与内存策略不 +完全相同,但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作 +用:: + + + /* + * 当cpuset被配置时,它打破了严格的hugetlb页面预留,因为计数是在一个全局变量上完 + * 成的。在有cpuset的情况下,这样的预留完全是垃圾,因为预留没有根据当前cpuset的 + * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时,应用程序仍然有可能 + * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的(或者说太难看了),因 + * 为cpuset太不稳定了,任务或内存节点可以在cpuset之间动态移动。与cpuset共享 + * hugetlb映射的语义变化是不可取的。然而,为了预留一些语义,我们退回到检查当前空闲 + * 页的可用性,作为一种最好的尝试,希望能将cpuset改变语义的影响降到最低。 + */ + +添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败(OOM)。然而,如果一个应用 +程序使用cpusets或内存策略,就不能保证在所需的节点上有巨页可用。即使有足够数量的全局 +预留,也是如此。 + +Hugetlbfs回归测试 +================= + +最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码,请使用 +libhugetlbfs测试套件来检查回归情况。此外,如果你添加了任何新的hugetlb功能,请在 +libhugetlbfs中添加适当的测试。 + +-- +Mike Kravetz,2017年4月7日 diff --git a/Documentation/translations/zh_CN/mm/hwpoison.rst b/Documentation/translations/zh_CN/mm/hwpoison.rst new file mode 100644 index 000000000000..310862edc937 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hwpoison.rst @@ -0,0 +1,166 @@ + +:Original: Documentation/mm/hwpoison.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +hwpoison +======== + +什么是hwpoison? +=============== + + +即将推出的英特尔CPU支持从一些内存错误中恢复( ``MCA恢复`` )。这需要操作系统宣布 +一个页面"poisoned",杀死与之相关的进程,并避免在未来使用它。 + +这个补丁包在虚拟机中实现了必要的(编程)框架。 + +引用概述中的评论:: + + 高级机器的检查与处理。处理方法是损坏的页面被硬件报告,通常是由于2位ECC内 + 存或高速缓存故障。 + + 这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时,当前运行的进程 + 可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理,就可 + 以安全地忽略它. 而不是用另外一个机器检查去处理它。 + + 处理不同状态的页面缓存页。这里棘手的部分是,相对于其他虚拟内存用户, 我们可以异 + 步访问任何页面。因为内存故障可能随时随地发生,可能违反了他们的一些假设。这就是 + 为什么这段代码必须非常小心。一般来说,它试图使用正常的锁规则,如获得标准锁,即使 + 这意味着错误处理可能需要很长的时间。 + + 这里的一些操作有点低效,并且具有非线性的算法复杂性,因为数据结构没有针对这种情 + 况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的,所 + 以我们希望我们可以摆脱这种情况。 + +该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的 +各种检查组成,用来处理poison的页面。 + +现在主要目标是KVM客户机,但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm +版本。 + +对于KVM的使用,需要一个新的信号类型,这样KVM就可以用适当的地址将机器检查注入到客户 +机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是,所有的应用程序都不要这 +样做,但一些非常专业的应用程序可能会这样做。 + +故障恢复模式 +============ + +有两种(实际上是三种)模式的内存故障恢复可以在。 + +vm.memory_failure_recovery sysctl 置零: + 所有的内存故障都会导致panic。请不要尝试恢复。 + +早期处理 + (可以在全局和每个进程中控制) 一旦检测到错误,立即向应用程序发送SIGBUS这允许 + 应用程序以温和的方式处理内存错误(例如,放弃受影响的对象) 这是KVM qemu使用的 + 模式。 + +推迟处理 + 当应用程序运行到损坏的页面时,发送SIGBUS。这对不知道内存错误的应用程序来说是 + 最好的,默认情况下注意一些页面总是被当作late kill处理。 + +用户控制 +======== + +vm.memory_failure_recovery + 参阅 sysctl.txt + +vm.memory_failure_early_kill + 全局启用early kill + +PR_MCE_KILL + 设置early/late kill mode/revert 到系统默认值。 + + arg1: PR_MCE_KILL_CLEAR: + 恢复到系统默认值 + arg1: PR_MCE_KILL_SET: + arg2定义了线程特定模式 + + PR_MCE_KILL_EARLY: + Early kill + PR_MCE_KILL_LATE: + Late kill + PR_MCE_KILL_DEFAULT + 使用系统全局默认值 + + 注意,如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO),你应该在 + 指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则,SIGBUS将被发送到主线程。 + +PR_MCE_KILL_GET + 返回当前模式 + +测试 +==== + +* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面 + +* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块 + + corrupt-pfn + 在PFN处注入hwpoison故障,并echoed到这个文件。这做了一些早期过滤,以避 + 免在测试套件中损坏非预期页面。 + unpoison-pfn + 在PFN的Software-unpoison页面对应到这个文件。这样,一个页面可以再次被 + 复用。这只对Linux注入的故障起作用,对真正的内存故障不起作用。 + + 注意这些注入接口并不稳定,可能会在不同的内核版本中发生变化 + + corrupt-filter-dev-major, corrupt-filter-dev-minor + 只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通 + 配符值。这应该只用于人工注入的测试。 + + corrupt-filter-memcg + 限制注入到memgroup拥有的页面。由memcg的inode号指定。 + + Example:: + + mkdir /sys/fs/cgroup/mem/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks + + memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages + + corrupt-filter-flags-mask, corrupt-filter-flags-value + 当指定时,只有在((page_flags & mask) == value)的情况下才会poison页面。 + 这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相 + 同。这些标志位在include/linux/kernel-page-flags.h中定义,并在 + Documentation/admin-guide/mm/pagemap.rst中记录。 + +* 架构特定的MCE注入器 + + x86 有 mce-inject, mce-test + + 在mce-test中的一些便携式hwpoison测试程序,见下文。 + +引用 +==== + +http://halobates.de/mce-lc09-2.pdf + 09年LinuxCon的概述演讲 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + 测试套件(在tsrc中的hwpoison特定可移植测试)。 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86特定的注入器 + + +限制 +==== +- 不是所有的页面类型都被支持,而且永远不会。大多数内核内部对象不能被恢 + 复,目前只有LRU页。 + +--- +Andi Kleen, 2009年10月 diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst new file mode 100644 index 000000000000..4c8c6b7b72a3 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/index.rst @@ -0,0 +1,54 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/index.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +================= +Linux内存管理文档 +================= + +这是一个关于Linux内存管理(mm)子系统内部的文档集,其中有不同层次的细节,包括注释 +和邮件列表的回复,用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建 +议,请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。 +对于控制和调整指南,请参阅(Documentation/admin-guide/mm/index)。 +TODO:待引用文档集被翻译完毕后请及时修改此处) + +.. toctree:: + :maxdepth: 1 + + active_mm + balance + damon/index + free_page_reporting + highmem + ksm + frontswap + hmm + hwpoison + hugetlbfs_reserv + memory-model + mmu_notifier + numa + overcommit-accounting + page_frags + page_owner + page_table_check + remap_file_pages + split_page_table_lock + z3fold + zsmalloc + +TODOLIST: +* arch_pgtable_helpers +* free_page_reporting +* hugetlbfs_reserv +* page_migration +* slub +* transhuge +* unevictable-lru +* vmalloced-kernel-stacks diff --git a/Documentation/translations/zh_CN/mm/ksm.rst b/Documentation/translations/zh_CN/mm/ksm.rst new file mode 100644 index 000000000000..d1f82e857ad7 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/ksm.rst @@ -0,0 +1,70 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/ksm.rst + +:翻译: + + 徐鑫 xu xin + +============ +内核同页合并 +============ + +KSM 是一种节省内存的数据去重功能,由CONFIG_KSM=y启用,并在2.6.32版本时被添加 +到Linux内核。详见 ``mm/ksm.c`` 的实现,以及http://lwn.net/Articles/306704和 +https://lwn.net/Articles/330589 + +KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst +文档中有描述。 + +设计 +==== + +概述 +---- + +概述内容请见mm/ksm.c文档中的“DOC: Overview” + +逆映射 +------ +KSM维护着稳定树中的KSM页的逆映射信息。 + +当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时,则代表了 +KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时,这个KSM页 +的 ``page->mapping`` 指向了该稳定树节点。 + +如果共享数超过了阈值,KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多 +个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息,其中 ``page->mapping`` +指向该"副本"。 + +每个链以及链接到该链中的所有"副本"强制不变的是,它们代表了相同的写保护内存 +内容,尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。 + +这样一来,相比与无限的逆映射链表,稳定树的查找计算复杂性不受影响。但在稳定树 +本身中不能有重复的KSM页面内容仍然是强制要求。 + +由 ``max_page_sharing`` 强制决定的数据去重限制是必要的,以此来避免虚拟内存 +rmap链表变得过大。rmap的遍历具有O(N)的复杂度,其中N是共享页面的rmap_项(即 +虚拟映射)的数量,而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。 +因此,这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进 +程在稳定节点"链"上的遍历也是O(N),但这个N是稳定树"副本"的数量,而不是rmap项 +的数量,因此它对ksmd性能没有显著影响。实际上,最佳稳定树"副本"的候选节点将 +保留在"副本"列表的开头。 + +``max_page_sharing`` 的值设置得高了会促使更快的内存合并(因为将有更少的稳定 +树副本排队进入稳定节点chain->hlist)和更高的数据去重系数,但代价是在交换、压 +缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。 + +``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控 +的影响,高比值可能意味着稳定节点dup中存在碎片,这可以通过在ksmd中引入碎片算 +法来解决,该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup,以便释放 +那些仅包含极少rmap项的稳定节点"dup",但这可能会增加ksmd进程的CPU使用率,并可 +能会减慢应用程序在KSM页面上的只读计算。 + +KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本",以便删减过时了的稳定节点。 +这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。 + +参考 +==== +内核代码请见mm/ksm.c。 +涉及的函数(mm_slot ksm_scan stable_node rmap_item)。 diff --git a/Documentation/translations/zh_CN/mm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst new file mode 100644 index 000000000000..77ec149a970c --- /dev/null +++ b/Documentation/translations/zh_CN/mm/memory-model.rst @@ -0,0 +1,135 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/memory-model.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +============ +物理内存模型 +============ + +系统中的物理内存可以用不同的方式进行寻址。最简单的情况是,物理内存从地址0开 +始,跨越一个连续的范围,直到最大的地址。然而,这个范围可能包含CPU无法访问的 +小孔隙。那么,在完全不同的地址可能有几个连续的范围。而且,别忘了NUMA,即不 +同的内存库连接到不同的CPU。 + +Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每 +个架构都定义了它所支持的内存模型,默认的内存模型是什么,以及是否有可能手动 +覆盖该默认值。 + +所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页 +帧的状态。 + +无论选择哪种内存模型,物理页框号(PFN)和相应的 `struct page` 之间都存 +在一对一的映射关系。 + +每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn` +帮助函数,允许从PFN到 `struct page` 的转换,反之亦然。 + +FLATMEM +======= + +最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的 +物理内存。 + +在FLATMEM内存模型中,有一个全局的 `mem_map` 数组来映射整个物理内存。对 +于大多数架构,孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page` +对象从未被完全初始化。 + +为了分配 `mem_map` 数组,架构特定的设置代码应该调用free_area_init()函数。 +然而,在调用memblock_free_all()函数之前,映射数组是不能使用的,该函数 +将所有的内存交给页分配器。 + +一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下,特 +定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。 + +使用FLATMEM,PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET` +是 `mem_map` 数组的一个索引。 + +`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。 + +SPARSEMEM +========= + +SPARSEMEM是Linux中最通用的内存模型,它是唯一支持若干高级功能的内存模型, +如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟 +初始化。 + +SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构 +体表示,它包含 `section_mem_map` ,从逻辑上讲,它是一个指向 `struct page` +阵列的指针。然而,它被存储在一些其他的magic中,以帮助分区管理。区段的大小 +和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量 +来指定的,这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS` +是一个架构所支持的物理地址的实际宽度,而 `SECTION_SIZE_BITS` 是一个任 +意的值。 + +最大的段数表示为 `NR_MEM_SECTIONS` ,定义为 + +.. math:: + + NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} + +`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的 +大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数: + +* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时, `mem_sections` 数组是静态的,有 + `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。 +* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时, `mem_sections` 数组被动态分配。 + 每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象,行数的计算是为了适应所有的 + 内存区。 + +架构设置代码应该调用sparse_init()来初始化内存区和内存映射。 + +通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和 + "sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的 + 值决定。 + +Classic sparse在page->flags中编码了一个页面的段号,并使用PFN的高位来访问映射该页 +框的段。在一个区段内,PFN是指向页数组的索引。 + +Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操 +作。有一个全局的 `struct page *vmemmap` 指针,指向一个虚拟连续的 `struct page` +对象阵列。PFN是该数组的一个索引,`struct page` 从 `vmemmap` 的偏移量是该页的PFN。 + +为了使用vmemmap,一个架构必须保留一个虚拟地址的范围,以映射包含内存映射的物理页,并 +确保 `vmemmap`指向该范围。此外,架构应该实现 :c:func:`vmemmap_populate` 方法, +它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求, +它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。 + +虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分 +配的存储中。这种存储用vmem_altmap结构表示,最终通过一长串的函数调用传递给 +vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和 +:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。 + +ZONE_DEVICE +=========== +`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上,为设备驱动识别的物理地址范 +围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下 +事实有关:这些地址范围的页面对象从未被在线标记过,而且必须对设备进行引用,而不仅仅 +是页面,以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ,通过 :c:func:`devm_memremap_pages` , +为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`, +:c:func:`page_to_pfn`, ,和 :c:func:`get_user_pages` 服务。由于页面引 +用计数永远不会低于1,所以页面永远不会被追踪为空闲内存,页面的 `struct list_head lru` +空间被重新利用,用于向映射该内存的主机设备/驱动程序进行反向引用。 + +虽然 `SPARSEMEM` 将内存作为一个区段的集合,可以选择收集并合成内存块,但 +`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE` +内存从未被在线标记,因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界 +上。这个实现依赖于这种缺乏用户接口的约束,允许子段大小的内存范围被指定给 +:c:func:`arch_add_memory` ,即内存热插拔的上半部分。子段支持允许2MB作为 +:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。 + +`ZONE_DEVICE` 的用户是: + +* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。 + +* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` , + 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见Documentation/mm/hmm.rst。 + +* p2pdma: 创建 `struct page` 对象,允许PCI/E拓扑结构中的peer设备协调它们之间的 + 直接DMA操作,即绕过主机内存。 diff --git a/Documentation/translations/zh_CN/mm/mmu_notifier.rst b/Documentation/translations/zh_CN/mm/mmu_notifier.rst new file mode 100644 index 000000000000..ce3664d1a410 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/mmu_notifier.rst @@ -0,0 +1,97 @@ +:Original: Documentation/mm/mmu_notifier.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + + +什么时候需要页表锁内通知? +========================== + +当清除一个pte/pmd时,我们可以选择通过在页表锁下(通知版的\*_clear_flush调用 +mmu_notifier_invalidate_range)通知事件。但这种通知并不是在所有情况下都需要的。 + +对于二级TLB(非CPU TLB),如IOMMU TLB或设备TLB(当设备使用类似ATS/PASID的东西让 +IOMMU走CPU页表来访问进程的虚拟地址空间)。只有两种情况需要在清除pte/pmd时在持有页 +表锁的同时通知这些二级TLB: + + A) 在mmu_notifier_invalidate_range_end()之前,支持页的地址被释放。 + B) 一个页表项被更新以指向一个新的页面(COW,零页上的写异常,__replace_page(),...)。 + +情况A很明显,你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。 + +情况B更加微妙。为了正确起见,它需要按照以下序列发生: + + - 上页表锁 + - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify()) + - 设置页表项以指向新页 + +如果在设置新的pte/pmd值之前,清除页表项之后没有进行通知,那么你就会破坏设备的C11或 +C++11等内存模型。 + +考虑以下情况(设备使用类似于ATS/PASID的功能)。 + +两个地址addrA和addrB,这样|addrA - addrB| >= PAGE_SIZE,我们假设它们是COW的 +写保护(B的其他情况也适用)。 + +:: + + [Time N] -------------------------------------------------------------------- + CPU-thread-0 {尝试写到addrA} + CPU-thread-1 {尝试写到addrB} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {读取addrA并填充设备TLB} + DEV-thread-2 {读取addrB并填充设备TLB} + [Time N+1] ------------------------------------------------------------------ + CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} + CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+2] ------------------------------------------------------------------ + CPU-thread-0 {COW_step1: {更新页表以指向addrA的新页}} + CPU-thread-1 {COW_step1: {更新页表以指向addrB的新页}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {写入addrA,这是对新页面的写入} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {} + CPU-thread-3 {写入addrB,这是一个写入新页的过程} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+4] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+5] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {从旧页中读取addrA} + DEV-thread-2 {从新页面读取addrB} + +所以在这里,因为在N+2的时候,清空页表项没有和通知一起作废二级TLB,设备在看到addrA的新值之前 +就看到了addrB的新值。这就破坏了设备的总内存序。 + +当改变一个pte的写保护或指向一个新的具有相同内容的写保护页(KSM)时,将mmu_notifier_invalidate_range +调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程 +在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占,也是如此。 diff --git a/Documentation/translations/zh_CN/mm/numa.rst b/Documentation/translations/zh_CN/mm/numa.rst new file mode 100644 index 000000000000..b15cfeeb6dfb --- /dev/null +++ b/Documentation/translations/zh_CN/mm/numa.rst @@ -0,0 +1,101 @@ +:Original: Documentation/mm/numa.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +始于1999年11月,作者: + +========================== +何为非统一内存访问(NUMA)? +========================== + +这个问题可以从几个视角来回答:硬件观点和Linux软件视角。 + +从硬件角度看,NUMA系统是一个由多个组件或装配组成的计算机平台,每个组件可能包含0个或更多的CPU、 +本地内存和/或IO总线。为了简洁起见,并将这些物理组件/装配的硬件视角与软件抽象区分开来,我们在 +本文中称这些组件/装配为“单元”。 + +每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能 +不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如,交叉开关或点对点 +链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来,以创建NUMA平台,其中的单元与其 +他单元有多个距离。 + +对于Linux,感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中, +所有的内存都是可见的,并且可以从连接到任何单元的任何CPU中访问,缓存一致性是由处理器缓存和/或 +系统互连在硬件中处理。 + +内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元 +有多远。例如,连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和 +更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的(其他)单元。 + +平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反,这种架构是提供可扩展 +内存带宽的一种手段。然而,为了实现可扩展的内存带宽,系统和应用软件必须安排大部分的内存引用 +[cache misses]到“本地”内存——同一单元的内存,如果有的话——或者到最近的有内存的单元。 + +这就自然而然有了Linux软件对NUMA系统的视角: + +Linux将系统的硬件资源划分为多个软件抽象,称为“节点”。Linux将节点映射到硬件平台的物理单元 +上,对一些架构的细节进行了抽象。与物理单元一样,软件节点可能包含0或更多的CPU、内存和/或IO +总线。同样,对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快 +的访问时间和更高的有效带宽。 + +对于一些架构,如x86,Linux将“隐藏”任何代表没有内存连接的物理单元的节点,并将连接到该单元 +的任何CPU重新分配到代表有内存的单元的节点上。因此,在这些架构上,我们不能假设Linux将所有 +的CPU与一个给定的节点相关联,会看到相同的本地内存访问时间和带宽。 + +此外,对于某些架构,同样以x86为例,Linux支持对额外节点的仿真。对于NUMA仿真,Linux会将现 +有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部 +分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的,当与cpusets一起使用时, +可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst] + +对于每个有内存的节点,Linux构建了一个独立的内存管理子系统,有自己的空闲页列表、使用中页列表、 +使用统计和锁来调解访问。此外,Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE +中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求 +时要访问的区/节点。当一个区没有可用的内存来满足请求时,这种情况被称为“overflow 溢出”或 +“fallback 回退”。 + +由于一些节点包含多个包含不同类型内存的区,Linux必须决定是否对区列表进行排序,使分配回退到不同 +节点上的相同区类型,或同一节点上的不同区类型。这是一个重要的考虑因素,因为有些区,如DMA或DMA32, +代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距 +离排序的远程节点之前,它会尝试回退到同一节点的其他分区。 + +默认情况下,Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说,Linux将试 +图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能 +满足请求,内核将检查所选分区列表中其他节点的区域,寻找列表中第一个能满足请求的区域。 + +本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配 +一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构 +中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥 +远的调度域中。然而,调度器并没有直接考虑到任务的NUMA足迹。因此,在充分不平衡的情况下,任务可 +以在节点之间迁移,远离其初始节点和内核数据结构。 + +系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口,如taskset(1)和numactl(1),以及程 +序接口,如sched_setaffinity(2),来限制任务的迁移,以改善NUMA定位。此外,人们可以使用 +Linux NUMA内存策略修改内核的默认本地分配行为。 [见 +:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. + +系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点 +的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst] + +在不隐藏无内存节点的架构上,Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无 +内存的节点,“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反,它 +将是内核在建立分区列表时选择的离它最近的有内存的节点。所以,默认情况下,本地分配将由内核提供 +最近的可用内存来完成。这是同一机制的结果,该机制允许这种分配在一个包含内存的节点溢出时回退到 +其他附近的节点。 + +一些内核分配不希望或不能容忍这种分配回退行为。相反,他们想确保他们从指定的节点获得内存,或者 +得到通知说该节点没有空闲内存。例如,当一个子系统分配每个CPU的内存资源时,通常是这种情况。 + +一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的 +节点ID,然后只从返回的节点ID请求内存。当这样的分配失败时,请求的子系统可以恢复到它自己的回退 +路径。板块内核内存分配器就是这样的一个例子。或者,子系统可以选择在分配失败时禁用或不启用自己。 +内核分析子系统就是这样的一个例子。 + +如果架构支持——不隐藏无内存节点,那么连接到无内存节点的CPU将总是产生回退路径的开销,或者一些 +子系统如果试图完全从无内存的节点分配内存,将无法初始化。为了透明地支持这种架构,内核子系统可 +以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样,这是同 +一个节点,默认的本地页分配将从这个节点开始尝试。 diff --git a/Documentation/translations/zh_CN/mm/overcommit-accounting.rst b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst new file mode 100644 index 000000000000..d8452d8b7fbb --- /dev/null +++ b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst @@ -0,0 +1,86 @@ +:Original: Documentation/mm/overcommit-accounting.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + + +============== +超量使用审计 +============== + +Linux内核支持下列超量使用处理模式 + +0 + 启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。 + 它确保严重的疯狂分配失败,同时允许超量使用以减少swap的使用。在这种模式下, + 允许root分配稍多的内存。这是默认的。 +1 + 总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码,只是依赖 + 几乎完全由零页组成的虚拟内存 + +2 + 不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量 + (默认为50%)。根据你使用的数量,在大多数情况下,这意味着一个进程在访问页面时 + 不会被杀死,但会在内存分配上收到相应的错误。 + + 对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说 + 是很有用的。 + +超量使用策略是通过sysctl `vm.overcommit_memory` 设置的。 + +可以通过 `vm.overcommit_ratio` (百分比)或 `vm.overcommit_kbytes` (绝对值) +来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。 + +在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前 +的超量使用和提交量。 + +陷阱 +==== + +C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证,并在接近边缘的地方运行, +你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说,这并 +不重要,但如果你真的非常关心的话,这就是一个值得关注的案例。 + + +在模式2中,MAP_NORESERVE标志被忽略。 + + +它是如何工作的 +============== + +超量使用是基于以下规则 + +对于文件映射 + | SHARED or READ-only - 0 cost (该文件是映射而不是交换) + | PRIVATE WRITABLE - 每个实例的映射大小 + +对于匿名或者 ``/dev/zero`` 映射 + | SHARED - 映射的大小 + | PRIVATE READ-only - 0 cost (但作用不大) + | PRIVATE WRITABLE - 每个实例的映射大小 + +额外的计数 + | 通过mmap制作可写副本的页面 + | 从同一池中提取的shmfs内存 + +状态 +==== + +* 我们核算mmap内存映射 +* 我们核算mprotect在提交中的变化 +* 我们核算mremap的大小变化 +* 我们的审计 brk +* 审计munmap +* 我们在/proc中报告commit 状态 +* 核对并检查分叉的情况 +* 审查堆栈处理/执行中的构建 +* 叙述SHMfs的情况 +* 实现实际限制的执行 + +待续 +==== +* ptrace 页计数(这很难)。 diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst new file mode 100644 index 000000000000..320952ca93af --- /dev/null +++ b/Documentation/translations/zh_CN/mm/page_frags.rst @@ -0,0 +1,38 @@ +:Original: Documentation/mm/page_frag.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +页面片段 +======== + +一个页面片段是一个任意长度的任意偏移的内存区域,它位于一个0或更高阶的复合页面中。 +该页中的多个碎片在该页的引用计数器中被单独计算。 + +page_frag函数,page_frag_alloc和page_frag_free,为页面片段提供了一个简单 +的分配框架。这被网络堆栈和网络设备驱动使用,以提供一个内存的支持区域,作为 +sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 + +为了使用页面片段API,需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点, +并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用, +这在分配时开销可能会很大。然而,由于这种缓存的性质,要求任何对缓存的调用都要受到每 +个CPU的限制,或者每个CPU的限制,并在执行碎片分配时强制禁止中断。 + +网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 +netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache +被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 +它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 +将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 + +许多网络设备驱动程序使用类似的方法来分配页面片段,但页面片段是在环或描述符级别上 +缓存的。为了实现这些情况,有必要提供一种拆解页面缓存的通用方法。出于这个原因, +__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。 +这样做的好处是,它允许清理被添加到一个页面的多个引用,以避免每次分配都调用 +get_page。 + +Alexander Duyck,2016年11月29日。 diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst new file mode 100644 index 000000000000..03d9e613094a --- /dev/null +++ b/Documentation/translations/zh_CN/mm/page_owner.rst @@ -0,0 +1,116 @@ +:Original: Documentation/mm/page_owner.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +================================ +page owner: 跟踪谁分配的每个页面 +================================ + +概述 +==== + +page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。 +当分配发生时,有关分配的信息,如调用堆栈和页面的顺序被存储到每个页面的特定存储中。 +当我们需要了解所有页面的状态时,我们可以获得并分析这些信息。 + +尽管我们已经有了追踪页面分配/释放的tracepoint,但用它来分析谁分配的每个页面是 +相当复杂的。我们需要扩大跟踪缓冲区,以防止在用户空间程序启动前出现重叠。而且,启 +动的程序会不断地将跟踪缓冲区转出,供以后分析,这将会改变系统的行为,会产生更多的 +可能性,而不是仅仅保留在内存中,所以不利于调试。 + +页面所有者也可以用于各种目的。例如,可以通过每个页面的gfp标志信息获得精确的碎片 +统计。如果启用了page owner,它就已经实现并激活了。我们非常欢迎其他用途。 + +page owner在默认情况下是禁用的。所以,如果你想使用它,你需要在你的启动cmdline +中加入"page_owner=on"。如果内核是用page owner构建的,并且由于没有启用启动 +选项而在运行时禁用page owner,那么运行时的开销是很小的。如果在运行时禁用,它不 +需要内存来存储所有者信息,所以没有运行时内存开销。而且,页面所有者在页面分配器的 +热路径中只插入了两个不可能的分支,如果不启用,那么分配就会像没有页面所有者的内核 +一样进行。这两个不可能的分支应该不会影响到分配的性能,特别是在静态键跳转标签修补 +功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。 + +- 没有page owner:: + + text data bss dec hex filename + 48392 2333 644 51369 c8a9 mm/page_alloc.o + +- 有page owner:: + + text data bss dec hex filename + 48800 2445 644 51889 cab1 mm/page_alloc.o + 6662 108 29 6799 1a8f mm/page_owner.o + 1025 8 8 1041 411 mm/page_ext.o + +虽然总共增加了8KB的代码,但page_alloc.o增加了520字节,其中不到一半是在hotpath +中。构建带有page owner的内核,并在需要时打开它,将是调试内核内存问题的最佳选择。 + +有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这 +个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些,所以,在初始化 +之前,许多页面可以被分配,但它们没有所有者信息。为了解决这个问题,这些早期分配的 +页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息,但至 +少,我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上,有13343 +个早期分配的页面被捕捉和标记,尽管它们大部分是由结构页扩展功能分配的。总之,在这 +之后,没有任何页面处于未追踪状态。 + +使用方法 +======== + +1) 构建用户空间的帮助:: + + cd tools/vm + make page_owner_sort + +2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. + +3) 做你想调试的工作。 + +4) 分析来自页面所有者的信息:: + + cat /sys/kernel/debug/page_owner > page_owner_full.txt + ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + + ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值):: + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + ``page_owner_sort`` 工具忽略了 ``PFN`` 行,将剩余的行放在buf中,使用regexp提 + 取页序值,计算buf的次数和页数,最后根据参数进行排序。 + + 在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出:: + + XXX times, XXX pages: + Page allocated via order XXX, ... + // Detailed stack + + 默认情况下, ``page_owner_sort`` 是根据buf的时间来排序的。如果你想 + 按buf的页数排序,请使用-m参数。详细的参数是: + + 基本函数: + + Sort: + -a 按内存分配时间排序 + -m 按总内存排序 + -p 按pid排序。 + -P 按tgid排序。 + -r 按内存释放时间排序。 + -s 按堆栈跟踪排序。 + -t 按时间排序(默认)。 + + 其它函数: + + Cull: + -c 通过比较堆栈跟踪而不是总块来进行剔除。 + + Filter: + -f 过滤掉内存已被释放的块的信息。 diff --git a/Documentation/translations/zh_CN/mm/page_table_check.rst b/Documentation/translations/zh_CN/mm/page_table_check.rst new file mode 100644 index 000000000000..e8077310a76c --- /dev/null +++ b/Documentation/translations/zh_CN/mm/page_table_check.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/page_table_check.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +页表检查 +======== + +概述 +==== + +页表检查允许通过确保防止某些类型的内存损坏来强化内核。 + +当新的页面可以从用户空间访问时,页表检查通过将它们的页表项(PTEs PMD等)添加到页表中来执行额外 +的验证。 + +在检测到损坏的情况下,内核会被崩溃。页表检查有一个小的性能和内存开销。因此,它在默认情况下是禁用 +的,但是在额外的加固超过性能成本的系统上,可以选择启用。另外,由于页表检查是同步的,它可以帮助调 +试双映射内存损坏问题,在错误的映射发生时崩溃内核,而不是在内存损坏错误发生后内核崩溃。 + +双重映射检测逻辑 +================ + ++-------------------+-------------------+-------------------+------------------+ +| Current Mapping | New mapping | Permissions | Rule | ++===================+===================+===================+==================+ +| Anonymous | Anonymous | Read | Allow | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Anonymous | Read / Write | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Named | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Anonymous | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Named | Any | Allow | ++-------------------+-------------------+-------------------+------------------+ + +启用页表检查 +============ + +用以下方法构建内核: + +- PAGE_TABLE_CHECK=y + 注意,它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。 + +- 使用 "page_table_check=on" 内核参数启动。 + +可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核,以便在没有额外的内核参数的情况下获得页表 +支持。 diff --git a/Documentation/translations/zh_CN/mm/remap_file_pages.rst b/Documentation/translations/zh_CN/mm/remap_file_pages.rst new file mode 100644 index 000000000000..31e0c54dc36f --- /dev/null +++ b/Documentation/translations/zh_CN/mm/remap_file_pages.rst @@ -0,0 +1,32 @@ +:Original: Documentation/mm/remap_file_pages.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +============================== +remap_file_pages()系统调用 +============================== + +remap_file_pages()系统调用被用来创建一个非线性映射,也就是说,在这个映射中, +文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好 +处是,前者不需要内核创建额外的VMA(虚拟内存区)数据结构。 + +支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码,包括热 +路径。另外,为了使非线性映射工作,内核需要一种方法来区分正常的页表项和带有文件 +偏移的项(pte_file)。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资 +源,特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。 + +幸运的是,在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS +实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。 +由于64位系统的广泛使用,这种使用情况已经不重要了。 + +syscall被废弃了,现在用一个模拟来代替它。仿真会创建新的VMA,而不是非线性映射。 +对于remap_file_pages()的少数用户来说,它的工作速度会变慢,但ABI被保留了。 + +仿真的一个副作用(除了性能之外)是,由于额外的VMA,用户可以更容易达到 +vm.max_map_count的限制。关于限制的更多细节,请参见DEFAULT_MAX_MAP_COUNT +的注释。 diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst new file mode 100644 index 000000000000..4fb7aa666037 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst @@ -0,0 +1,96 @@ +:Original: Documentation/mm/split_page_table_lock.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +================================= +分页表锁(split page table lock) +================================= + +最初,mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方 +法导致了多线程应用程序的缺页异常可扩展性差,因为对锁的争夺很激烈。为了提高可扩 +展性,我们引入了分页表锁。 + +有了分页表锁,我们就有了单独的每张表锁来顺序化对表的访问。目前,我们对PTE和 +PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。 + +有一些辅助工具来锁定/解锁一个表和其他访问器函数: + + - pte_offset_map_lock() + 映射pte并获取PTE表锁,返回所取锁的指针; + - pte_unmap_unlock() + 解锁和解映射PTE表; + - pte_alloc_map_lock() + 如果需要的话,分配PTE表并获取锁,如果分配失败,返回已获取的锁的指针 + 或NULL; + - pte_lockptr() + 返回指向PTE表锁的指针; + - pmd_lock() + 取得PMD表锁,返回所取锁的指针。 + - pmd_lockptr() + 返回指向PMD表锁的指针; + +如果CONFIG_SPLIT_PTLOCK_CPUS(通常为4)小于或等于NR_CPUS,则在编译 +时启用PTE表的分页表锁。如果分页锁被禁用,所有的表都由mm->page_table_lock +来保护。 + +如果PMD表启用了分页锁,并且架构支持它,那么PMD表的分页锁就会被启用(见 +下文)。 + +Hugetlb 和分页表锁 +================== + +Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁,但不对PUD使用。 + +Hugetlb特定的辅助函数: + + - huge_pte_lock() + 对PMD_SIZE页面采取pmd分割锁,否则mm->page_table_lock; + - huge_pte_lockptr() + 返回指向表锁的指针。 + +架构对分页表锁的支持 +==================== + +没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor() +和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。 + +确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页 +面。这个区域与page->ptl共享存储。 + +PMD分页锁只有在你有两个以上的页表级别时才有意义。 + +启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调 +用pgtable_pmd_page_dtor()。 + +分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb() +中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先 +分配一些PMD。 + +一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。 + +注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必 +须正确处理。 + +page->ptl +========= + +page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struct page。它 +与page->private(以及union中的其他几个字段)共享存储。 + +为了避免增加struct page的大小并获得最佳性能,我们使用了一个技巧: + + - 如果spinlock_t适合于long,我们使用page->ptr作为spinlock,这样我们 + 就可以避免间接访问并节省一个缓存行。 + - 如果spinlock_t的大小大于long的大小,我们使用page->ptl作为spinlock_t + 的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的 + 情况下使用分页锁,但由于间接访问而多花了一个缓存行。 + +PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t +分配在pgtable_pmd_page_ctor()中。 + +请不要直接访问page->ptl - -使用适当的辅助函数。 diff --git a/Documentation/translations/zh_CN/mm/z3fold.rst b/Documentation/translations/zh_CN/mm/z3fold.rst new file mode 100644 index 000000000000..9569a6d88270 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/z3fold.rst @@ -0,0 +1,31 @@ +:Original: Documentation/mm/z3fold.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +====== +z3fold +====== + +z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。 +它是zbud的衍生物,允许更高的压缩率,保持其前辈的简单性和确定性。 + +z3fold和zbud的主要区别是: + +* 与zbud不同的是,z3fold允许最大的PAGE_SIZE分配。 +* z3fold在其页面中最多可以容纳3个压缩页面 +* z3fold本身没有输出任何API,因此打算通过zpool的API来使用 + +为了保持确定性和简单性,z3fold,就像zbud一样,总是在每页存储一个整数的压缩页,但是 +它最多可以存储3页,不像zbud最多可以存储2页。因此压缩率达到2.7倍左右,而zbud的压缩 +率是1.7倍左右。 + +不像zbud(但也像zsmalloc),z3fold_alloc()那样不返回一个可重复引用的指针。相反,它 +返回一个无符号长句柄,它编码了被分配对象的实际位置。 + +保持有效的压缩率接近于zsmalloc,z3fold不依赖于MMU的启用,并提供更可预测的回收行 +为,这使得它更适合于小型和反应迅速的系统。 diff --git a/Documentation/translations/zh_CN/mm/zsmalloc.rst b/Documentation/translations/zh_CN/mm/zsmalloc.rst new file mode 100644 index 000000000000..b5596ea08ae4 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/zsmalloc.rst @@ -0,0 +1,78 @@ +:Original: Documentation/mm/zs_malloc.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +======== +zsmalloc +======== + +这个分配器是为与zram一起使用而设计的。因此,该分配器应该在低内存条件下工作良好。特别是, +它从未尝试过higher order页面的分配,这在内存压力下很可能会失败。另一方面,如果我们只 +是使用单(0-order)页,它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将 +占据整个页面。这是其前身(xvmalloc)的主要问题之一。 + +为了克服这些问题,zsmalloc分配了一堆0-order页面,并使用各种"struct page"字段将它 +们链接起来。这些链接的页面作为一个单一的higher order页面,即一个对象可以跨越0-order +页面的边界。代码将这些链接的页面作为一个实体,称为zspage。 + +为了简单起见,zsmalloc只能分配大小不超过PAGE_SIZE的对象,因为这满足了所有当前用户的 +要求(在最坏的情况下,页面是不可压缩的,因此以"原样"即未压缩的形式存储)。对于大于这 +个大小的分配请求,会返回失败(见zs_malloc)。 + +此外,zs_malloc()并不返回一个可重复引用的指针。相反,它返回一个不透明的句柄(无符号 +长),它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久 +映射,因为这在32位系统上会导致问题,因为内核空间映射的VA区域非常小。因此,在使用分配 +的内存之前,对象必须使用zs_map_object()进行映射以获得一个可用的指针,随后使用 +zs_unmap_object()解除映射。 + +stat +==== + +通过CONFIG_ZSMALLOC_STAT,我们可以通过 ``/sys/kernel/debug/zsmalloc/`` +看到zsmalloc内部信息。下面是一个统计输出的例子。:: + + # cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + ... + ... + 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + ... + ... + + +class + 索引 +size + zspage存储对象大小 +almost_empty + ZS_ALMOST_EMPTY zspage的数量(见下文)。 +almost_full + ZS_ALMOST_FULL zspage的数量(见下图) +obj_allocated + 已分配对象的数量 +obj_used + 分配给用户的对象的数量 +pages_used + 为该类分配的页数 +pages_per_zspage + 组成一个zspage的0-order页面的数量 + +当n <= N / f时,我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组,其中 + +* n = 已分配对象的数量 +* N = zspage可以存储的对象总数 +* f = fullness_threshold_frac(即,目前是4个) + +同样地,我们将zspage分配给: + +* ZS_ALMOST_FULL when n > N / f +* ZS_EMPTY when n == 0 +* ZS_FULL when n == N diff --git a/Documentation/translations/zh_CN/vm/active_mm.rst b/Documentation/translations/zh_CN/vm/active_mm.rst deleted file mode 100644 index 366609ea4f37..000000000000 --- a/Documentation/translations/zh_CN/vm/active_mm.rst +++ /dev/null @@ -1,85 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/active_mm.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -========= -Active MM -========= - -这是一封linux之父回复开发者的一封邮件,所以翻译时我尽量保持邮件格式的完整。 - -:: - - List: linux-kernel - Subject: Re: active_mm - From: Linus Torvalds - Date: 1999-07-30 21:36:24 - - 因为我并不经常写解释,所以已经抄送到linux-kernel邮件列表,而当我做这些, - 且更多的人在阅读它们时,我觉得棒极了。 - - 1999年7月30日 星期五, David Mosberger 写道: - > - > 是否有一个简短的描述,说明task_struct中的 - > "mm" 和 "active_mm"应该如何使用? (如果 - > 这个问题在邮件列表中讨论过,我表示歉意--我刚 - > 刚度假回来,有一段时间没能关注linux-kernel了)。 - - 基本上,新的设定是: - - - 我们有“真实地址空间”和“匿名地址空间”。区别在于,匿名地址空间根本不关心用 - 户级页表,所以当我们做上下文切换到匿名地址空间时,我们只是让以前的地址空间 - 处于活动状态。 - - 一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线 - 程基本上都属于这一类,但即使是“真正的”线程也可以暂时说在一定时间内它们不 - 会对用户空间感兴趣,调度器不妨试着避免在切换VM状态上浪费时间。目前只有老 - 式的bdflush sync能做到这一点。 - - - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说,tsk->mm将是NULL, - 其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。 - - - 然而,我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此,我们 - 有 “tsk->active_mm”,它显示了当前活动的地址空间是什么。 - - 规则是,对于一个有真实地址空间的进程(即tsk->mm是 non-NULL),active_mm - 显然必须与真实的mm相同。 - - 对于一个匿名进程,tsk->mm == NULL,而tsk->active_mm是匿名进程运行时 - “借用”的mm。当匿名进程被调度走时,借用的地址空间被返回并清除。 - - 为了支持所有这些,“struct mm_struct”现在有两个计数器:一个是 “mm_users” - 计数器,即有多少 “真正的地址空间用户”,另一个是 “mm_count”计数器,即 “lazy” - 用户(即匿名用户)的数量,如果有任何真正的用户,则加1。 - - 通常情况下,至少有一个真正的用户,但也可能是真正的用户在另一个CPU上退出,而 - 一个lazy的用户仍在活动,所以你实际上得到的情况是,你有一个地址空间 **只** - 被lazy的用户使用。这通常是一个短暂的生命周期状态,因为一旦这个线程被安排给一 - 个真正的线程,这个 “僵尸” mm就会被释放,因为 “mm_count”变成了零。 - - 另外,一个新的规则是,**没有人** 再把 “init_mm” 作为一个真正的MM了。 - “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”,事实上,它主 - 要是在启动时使用,当时还没有真正的VM被创建。因此,用来检查的代码 - - if (current->mm == &init_mm) - - 一般来说,应该用 - - if (!current->mm) - - 取代上面的写法(这更有意义--测试基本上是 “我们是否有一个用户环境”,并且通常 - 由缺页异常处理程序和类似的东西来完成)。 - - 总之,我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1,因为它稍微改 - 变了接口以适配alpha(谁会想到呢,但alpha体系结构上下文切换代码实际上最终是 - 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的,alpha的PALcode将两者 - 连接起来,你需要同时切换两者)。 - - (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/translations/zh_CN/vm/balance.rst b/Documentation/translations/zh_CN/vm/balance.rst deleted file mode 100644 index e98a47ef24a8..000000000000 --- a/Documentation/translations/zh_CN/vm/balance.rst +++ /dev/null @@ -1,81 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/balance.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -内存平衡 -======== - -2000年1月开始,作者:Kanoj Sarcar - -对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配,需要进行 -内存平衡。 - -调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个 -原因可能是,调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退 -选项的机会主义高阶分配请求中。在这种情况下,调用者可能也希望避免唤醒kswapd。 - -__GFP_IO分配请求是为了防止文件系统死锁。 - -在没有非睡眠分配请求的情况下,做平衡似乎是有害的。页面回收可以被懒散地启动,也就是 -说,只有在需要的时候(也就是区域的空闲内存为0),而不是让它成为一个主动的过程。 - -也就是说,内核应该尝试从直接映射池中满足对直接映射页的请求,而不是回退到dma池中, -这样就可以保持dma池为dma请求(不管是不是原子的)所填充。类似的争论也适用于高内存 -和直接映射的页面。相反,如果有很多空闲的dma页,最好是通过从dma池中分配一个来满足 -常规的内存请求,而不是产生常规区域平衡的开销。 - -在2.2中,只有当空闲页总数低于总内存的1/64时,才会启动内存平衡/页面回收。如果dma -和常规内存的比例合适,即使dma区完全空了,也很可能不会进行平衡。2.2已经在不同内存 -大小的生产机器上运行,即使有这个问题存在,似乎也做得不错。在2.3中,由于HIGHMEM的 -存在,这个问题变得更加严重。 - -在2.3中,区域平衡可以用两种方式之一来完成:根据区域的大小(可能是低级区域的大小), -我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是,在平衡的时 -候,我们不需要看低级区的大小,坏的方面是,我们可能会因为忽略低级区可能较低的使用率 -而做过于频繁的平衡。另外,只要对分配程序稍作修改,就有可能将memclass()宏简化为一 -个简单的等式。 - -另一个可能的解决方案是,我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其 -低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题,并尽可能地保持了与2.2行为 -的接近。另外,平衡算法在各种架构上的工作方式也是一样的,这些架构有不同数量和类型的 -内存区。如果我们想变得更花哨一点,我们可以在未来为不同区域的自由页面分配不同的权重。 - -请注意,如果普通区的大小与dma区相比是巨大的,那么在决定是否平衡普通区的时候,考虑 -空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。 - -所附的补丁实现了第二个解决方案。它还 “修复”了两个问题:首先,在低内存条件下,kswapd -被唤醒,就像2.2中的非睡眠分配。第二,HIGHMEM区也被平衡了,以便给replace_with_highmem() -一个争取获得HIGHMEM页的机会,同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM -页不会被泄露(例如,在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下)。 - -kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的,可能 -是因为所有的分配请求都来自中断上下文,而所有的进程上下文都在睡眠。对于2.3, -kswapd并不真正需要平衡高内存区,因为中断上下文并不请求高内存页。kswapd看zone -结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。 - -如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力,而该区的内存压力 -已经低于其水位,则会进行偷取。 - -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: -这些是每个区的字段,用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时, -hysteric 的字段low_on_memory被设置。这个字段会一直被设置,直到空闲页数变成水位 -[WMARK_HIGH]。当low_on_memory被设置时,页面分配请求将尝试释放该区域的一些页面(如果 -请求中设置了GFP_WAIT)。与此相反的是,决定唤醒kswapd以释放一些区的页。这个决定不是基于 -hysteresis 的,而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行;在这种情况下, -zone_wake_kswapd也被设置。 - - -我所听到的(超棒的)想法: - -1. 动态经历应该影响平衡:可以跟踪一个区的失败请求的数量,并反馈到平衡方案中(jalvo@mbay.net)。 - -2. 实现一个类似于replace_with_highmem()的replace_with_regular(),以保留dma页面。 - (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/translations/zh_CN/vm/damon/api.rst b/Documentation/translations/zh_CN/vm/damon/api.rst deleted file mode 100644 index 21143eea4ebe..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/api.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/api.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======= -API参考 -======= - -内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` , -它位于源代码树的include/linux/。 - -结构体 -====== - -该API在以下内核代码中: - -include/linux/damon.h - - -函数 -==== - -该API在以下内核代码中: - -mm/damon/core.c diff --git a/Documentation/translations/zh_CN/vm/damon/design.rst b/Documentation/translations/zh_CN/vm/damon/design.rst deleted file mode 100644 index 46128b77c2b3..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/design.rst +++ /dev/null @@ -1,140 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/design.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -==== -设计 -==== - -可配置的层 -========== - -DAMON提供了数据访问监控功能,同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间 -并为之优化的基元。另一方面,作为DAMON的核心,准确性和开销的权衡机制是在纯逻辑空间中。DAMON -将这两部分分离在不同的层中,并定义了它的接口,以允许各种低层次的基元实现与核心逻辑的配置。 - -由于这种分离的设计和可配置的接口,用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的 -任何地址空间。如果没有提供合适的,用户可以自己实现基元。 - -例如,物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。 -另外,如果某些架构或设备支持特殊的优化访问检查基元,这些基元将很容易被配置。 - - -特定地址空间基元的参考实现 -========================== - -基本访问监测的低级基元被定义为两部分。: - -1. 确定地址空间的监测目标地址范围 -2. 目标空间中特定地址范围的访问检查。 - -DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。 - - -基于VMA的目标地址范围构造 -------------------------- - -这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间,只是要求用户手动设置监控目标地址范围。 - -在进程的超级巨大的虚拟地址空间中,只有小部分被映射到物理内存并被访问。因此,跟踪未映射的地 -址区域只是一种浪费。然而,由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声,所以严 -格来说,跟踪每一个映射并不是必须的,但在某些情况下甚至会产生很高的开销。也就是说,监测目标 -内部过于巨大的未映射区域应该被移除,以不占用自适应机制的时间。 - -出于这个原因,这个实现将复杂的映射转换为三个不同的区域,覆盖地址空间的每个映射区域。这三个 -区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面 -的mmap()区域之间的间隙,以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙 -在通常的地址空间中是异常巨大的,排除这些间隙就足以做出合理的权衡。下面详细说明了这一点:: - - - - - (small mmap()-ed regions and munmap()-ed regions) - - - - - -基于PTE访问位的访问检查 ------------------------ - -物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中 -找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表,而物理地址的实现则 -是查找与该地址有映射关系的每一个页表。通过这种方式,实现者找到并清除下一个采样目标地址的位, -并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统,即空闲页跟 -踪和回收逻辑。为了避免这种干扰,DAMON使其与空闲页面跟踪相互排斥,并使用 ``PG_idle`` 和 -``PG_young`` 页面标志来解决与回收逻辑的冲突,就像空闲页面跟踪那样。 - - -独立于地址空间的核心机制 -======================== - -下面四个部分分别描述了DAMON的核心机制和五个监测属性,即 ``采样间隔`` 、 ``聚集间隔`` 、 -``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。 - - -访问频率监测 ------------- - -DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置 -``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说,DAMON检查每个 ``采样间隔`` 对每 -个页面的访问,并将结果汇总。换句话说,计算每个页面的访问次数。在每个 ``聚合间隔`` 过 -去后,DAMON调用先前由用户注册的回调函数,以便用户可以阅读聚合的结果,然后再清除这些结 -果。这可以用以下简单的伪代码来描述:: - - while monitoring_on: - for page in monitoring_target: - if accessed(page): - nr_accesses[page] += 1 - if time() % aggregation_interval == 0: - for callback in user_registered_callbacks: - callback(monitoring_target, nr_accesses) - for page in monitoring_target: - nr_accesses[page] = 0 - sleep(sampling interval) - -这种机制的监测开销将随着目标工作负载规模的增长而任意增加。 - - -基于区域的抽样调查 ------------------- - -为了避免开销的无限制增加,DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持 -这个假设(一个区域内的页面具有相同的访问频率),该区域内就只需要检查一个页面。因此,对 -于每个 ``采样间隔`` ,DAMON在每个区域中随机挑选一个页面,等待一个 ``采样间隔`` ,检 -查该页面是否同时被访问,如果被访问则增加该区域的访问频率。因此,监测开销是可以通过设置 -区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。 - -然而,如果假设没有得到保证,这个方案就不能保持输出的质量。 - - -适应性区域调整 --------------- - -即使最初的监测目标区域被很好地构建以满足假设(同一区域内的页面具有相似的访问频率),数 -据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设,DAMON根据每个 -区域的访问频率自适应地进行合并和拆分。 - -对于每个 ``聚集区间`` ,它比较相邻区域的访问频率,如果频率差异较小,就合并这些区域。 -然后,在它报告并清除每个区域的聚合接入频率后,如果区域总数不超过用户指定的最大区域数, -它将每个区域拆分为两个或三个区域。 - -通过这种方式,DAMON提供了其最佳的质量和最小的开销,同时保持了用户为其权衡设定的界限。 - - -动态目标空间更新处理 --------------------- - -监测目标地址范围可以动态改变。例如,虚拟内存可以动态地被映射和解映射。物理内存可以被 -热插拔。 - -由于在某些情况下变化可能相当频繁,DAMON允许监控操作检查动态变化,包括内存映射变化, -并仅在用户指定的时间间隔( ``更新间隔`` )中的每个时间段,将其应用于监控操作相关的 -数据结构,如抽象的监控目标内存区。 \ No newline at end of file diff --git a/Documentation/translations/zh_CN/vm/damon/faq.rst b/Documentation/translations/zh_CN/vm/damon/faq.rst deleted file mode 100644 index 07b4ac19407d..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/faq.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/faq.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -常见问题 -======== - -为什么是一个新的子系统,而不是扩展perf或其他用户空间工具? -========================================================== - -首先,因为它需要尽可能的轻量级,以便可以在线使用,所以应该避免任何不必要的开销,如内核-用户 -空间的上下文切换成本。第二,DAMON的目标是被包括内核在内的其他程序所使用。因此,对特定工具 -(如perf)的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。 - - -“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗? -============================================== - -闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的,尽管它可以 -使用采样来减少开销。另一方面,DAMON是一个更高层次的框架,用于监控各种地址空间。它专注于内 -存管理优化,并提供复杂的精度/开销处理机制。因此,“空闲页面跟踪” 和 “perf mem” 可以提供 -DAMON输出的一个子集,但不能替代DAMON。 - - -DAMON是否只支持虚拟内存? -========================= - -不,DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始 -部分,包括监测目标区域的构造和实际的访问检查。通过这种方式,DAMON用户可以用任何访问检查技 -术来监测任何地址空间。 - -尽管如此,DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间 -相关功能的实现,以供参考和方便使用。 - - -我可以简单地监测页面的粒度吗? -============================== - -是的,你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。 -因为监视目标区域的大小被强制为 ``>=page size`` ,所以区域分割不会产生任何影响。 diff --git a/Documentation/translations/zh_CN/vm/damon/index.rst b/Documentation/translations/zh_CN/vm/damon/index.rst deleted file mode 100644 index 84d36d90c9b0..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/index.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/index.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -========================== -DAMON:数据访问监视器 -========================== - -DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为 -(该核心机制详见(Documentation/translations/zh_CN/vm/damon/design.rst)) - - - *准确度* (监测输出对DRAM级别的内存管理足够有用;但可能不适合CPU Cache级别), - - *轻量级* (监控开销低到可以在线应用),以及 - - *可扩展* (无论目标工作负载的大小,开销的上限值都在恒定范围内)。 - -因此,利用这个框架,内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实 -验性内存管理优化工作可以再次进行。同时,在用户空间,有一些特殊工作负载的用户可以编写 -个性化的应用程序,以便更好地了解和优化他们的工作负载和系统。 - -.. toctree:: - :maxdepth: 2 - - faq - design - api - diff --git a/Documentation/translations/zh_CN/vm/free_page_reporting.rst b/Documentation/translations/zh_CN/vm/free_page_reporting.rst deleted file mode 100644 index 31d6c34b956b..000000000000 --- a/Documentation/translations/zh_CN/vm/free_page_reporting.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/_free_page_reporting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========== -空闲页报告 -========== - -空闲页报告是一个API,设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟 -化的情况下是很有用的,客户机能够使用这些数据来通知管理器它不再使用内存中的某些页 -面。 - -对于驱动,通常是气球驱动要使用这个功能,它将分配和初始化一个page_reporting_dev_info -结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必 -须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。 -假设没有其他页面报告设备已经注册, 对page_reporting_register的调用将向报告框 -架注册页面报告接口。 - -一旦注册,页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告 -页面,并在任何足够高的页面被释放之后2秒继续报告。 - -报告的页面将被存储在传递给报告函数的散列表中,最后一个条目的结束位被设置在条目 -nent-1中。 当页面被报告函数处理时,分配器将无法访问它们。一旦报告函数完成,这些 -页将被返回到它们所获得的自由区域。 - -在移除使用空闲页报告的驱动之前,有必要调用page_reporting_unregister,以移除 -目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报 -告通过该接口发出。如果另一个驱动或同一驱动被注册,它就有可能恢复前一个驱动在报告 -空闲页方面的工作。 - - -Alexander Duyck, 2019年12月04日 diff --git a/Documentation/translations/zh_CN/vm/frontswap.rst b/Documentation/translations/zh_CN/vm/frontswap.rst deleted file mode 100644 index 3eb07870e2ef..000000000000 --- a/Documentation/translations/zh_CN/vm/frontswap.rst +++ /dev/null @@ -1,196 +0,0 @@ -:Original: Documentation/vm/_free_page_reporting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========= -Frontswap -========= - -Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由 -于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能 -节省(提高)。 - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 -储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory -(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 -求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 -调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops -的功能,它提供的功能必须符合某些策略,如下所示: - -一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap -交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 -量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 -内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 -transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 -相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 - -一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 -要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 -经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, -也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 -可以像往常一样被写入交换空间。 - -请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” -的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 -不会从frontswap中获得。 - -如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 -debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): - -``failed_stores`` - 有多少次存储的尝试是失败的 - -``loads`` - 尝试了多少次加载(应该全部成功) - -``succ_stores`` - 有多少次存储的尝试是成功的 - -``invalidates`` - 尝试了多少次作废 - -后台实现可以提供额外的指标。 - -经常问到的问题 -============== - -* 价值在哪里? - -当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 -读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 -能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 -移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 -页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 - -Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 -灵活性: - -在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 -全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 -用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 -工作负载上,则有明显的性能改善(25%以上)。 - -“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory -的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 -统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 -交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 -可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 -有多少内存可用 - -在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 -用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 -是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory -后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, -而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 -的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap -允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), -从而减轻计划外交换可能带来的可怕的性能影响。 - -一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 -内存扩展技术的调查也在进行中。 - -* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? - -如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 -个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 -frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 -是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 -后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 -忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 -的,无论如何使用一小部分的 CPU 都是不相关的。 - -至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 -每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 -页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 -会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 -非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 - -当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 -产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 -来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 - -* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? - -我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 -明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 -供了多少内存是完全动态和随机的。 - -每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 -型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 -换页的尝试。 - -每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 -frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 -间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap -backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 -页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 -据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 -下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 -页面偏移量,否则它就会将数据写入该设备。 - -当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), -检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 -的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, -以便从真正的交换设备上获得这一页的数据。 - -所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 -入都被 “frontswap backend store” 和(可能)“frontswap backend loads” -所取代,这可能会快得多。 - -* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 - 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? - -首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 -结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 -假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 -及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 - -例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend -的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 -先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 -以根据当前的内存限制动态地定义。 - -此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 -块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 -不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 -态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 -复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 -的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” -线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, -KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 - -在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 -例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 -自己的交换。 - -transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 -能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, -frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 -容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 -swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap -就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 -backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 - - -* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, - 难道它不能总是被成功地覆盖吗? - -几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 -缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 -backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap -拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 -换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 - -* 为什么frontswap补丁会创建新的头文件swapfile.h? - -frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 -在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 -个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 - -Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/vm/highmem.rst b/Documentation/translations/zh_CN/vm/highmem.rst deleted file mode 100644 index 018838e58c3e..000000000000 --- a/Documentation/translations/zh_CN/vm/highmem.rst +++ /dev/null @@ -1,128 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/highmem.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========== -高内存处理 -========== - -作者: Peter Zijlstra - -.. contents:: :local: - -高内存是什么? -============== - -当物理内存的大小接近或超过虚拟内存的最大大小时,就会使用高内存(highmem)。在这一点上,内 -核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内 -存的临时映射。 - -没有被永久映射覆盖的那部分(物理)内存就是我们所说的 "高内存"。对于这个边界的确切位置,有 -各种架构上的限制。 - -例如,在i386架构中,我们选择将内核映射到每个进程的虚拟空间,这样我们就不必为内核的进入/退 -出付出全部的TLB作废代价。这意味着可用的虚拟内存空间(i386上为4GiB)必须在用户和内核空间之 -间进行划分。 - -使用这种方法的架构的传统分配方式是3:1,3GiB用于用户空间,顶部的1GiB用于内核空间。:: - - +--------+ 0xffffffff - | Kernel | - +--------+ 0xc0000000 - | | - | User | - | | - +--------+ 0x00000000 - -这意味着内核在任何时候最多可以映射1GiB的物理内存,但是由于我们需要虚拟地址空间来做其他事 -情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少(通常在~896MiB左右)。 - -其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而,一些硬件(如一些ARM)在使 -用mm上下文标签时,其虚拟空间有限。 - - -临时虚拟映射 -============ - -内核包含几种创建临时映射的方法。: - -* vmap(). 这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization - 来解除映射。 - -* kmap(). 这允许对单个页面进行短期映射。它需要synchronization,但在一定程度上被摊销。 - 当以嵌套方式使用时,它也很容易出现死锁,因此不建议在新代码中使用它。 - -* kmap_atomic(). 这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上, - 它表现得很好,但发布任务因此被要求留在该CPU上直到它完成,以免其他任务取代它的映射。 - - kmap_atomic() 也可以由中断上下文使用,因为它不睡眠,而且调用者可能在调用kunmap_atomic() - 之后才睡眠。 - - 可以假设k[un]map_atomic()不会失败。 - - -使用kmap_atomic -=============== - -何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存(见__GFP_HIGHMEM) -分配的页面的内容时,例如在页缓存中的页面,就会使用它。该API有两个函数,它们的使用方式与 -下面类似:: - - /* 找到感兴趣的页面。 */ - struct page *page = find_get_page(mapping, offset); - - /* 获得对该页内容的访问权。 */ - void *vaddr = kmap_atomic(page); - - /* 对该页的内容做一些处理。 */ - memset(vaddr, 0, PAGE_SIZE); - - /* 解除该页面的映射。 */ - kunmap_atomic(vaddr); - -注意,kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。 - -如果你需要映射两个页面,因为你想从一个页面复制到另一个页面,你需要保持kmap_atomic调用严 -格嵌套,如:: - - vaddr1 = kmap_atomic(page1); - vaddr2 = kmap_atomic(page2); - - memcpy(vaddr1, vaddr2, PAGE_SIZE); - - kunmap_atomic(vaddr2); - kunmap_atomic(vaddr1); - - -临时映射的成本 -============== - -创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。 - -如果CONFIG_HIGHMEM没有被设置,那么内核会尝试用一点计算来创建映射,将页面结构地址转换成 -指向页面内容的指针,而不是去捣鼓映射。在这种情况下,解映射操作可能是一个空操作。 - -如果CONFIG_MMU没有被设置,那么就不可能有临时映射和高内存。在这种情况下,也将使用计算方法。 - - -i386 PAE -======== - -在某些情况下,i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果: - -* Linux需要为系统中的每个页面建立一个页帧结构,而且页帧需要驻在永久映射中,这意味着: - -* 你最多可以有896M/sizeof(struct page)页帧;由于页结构体是32字节的,所以最终会有 - 112G的页;然而,内核需要在内存中存储更多的页帧...... - -* PAE使你的页表变大--这使系统变慢,因为更多的数据需要在TLB填充等方面被访问。一个好处 - 是,PAE有更多的PTE位,可以提供像NX和PAT这样的高级功能。 - -一般的建议是,你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作 -量有用,但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。 diff --git a/Documentation/translations/zh_CN/vm/hmm.rst b/Documentation/translations/zh_CN/vm/hmm.rst deleted file mode 100644 index 2379df95aa58..000000000000 --- a/Documentation/translations/zh_CN/vm/hmm.rst +++ /dev/null @@ -1,361 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/hmm.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -================== -异构内存管理 (HMM) -================== - -提供基础设施和帮助程序以将非常规内存(设备内存,如板上 GPU 内存)集成到常规内核路径中,其 -基石是此类内存的专用struct page(请参阅本文档的第 5 至 7 节)。 - -HMM 还为 SVM(共享虚拟内存)提供了可选的帮助程序,即允许设备透明地访问与 CPU 一致的程序 -地址,这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得 -必不可少,其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。 - -本文档分为以下部分:在第一部分中,我揭示了与使用特定于设备的内存分配器相关的问题。在第二 -部分中,我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页 -表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后, -最后一节介绍了一个新的迁移助手,它允许利用设备 DMA 引擎。 - -.. contents:: :local: - -使用特定于设备的内存分配器的问题 -================================ - -具有大量板载内存(几 GB)的设备(如 GPU)历来通过专用驱动程序特定 API 管理其内存。这会 -造成设备驱动程序分配和管理的内存与常规应用程序内存(私有匿名、共享内存或常规文件支持内存) -之间的隔断。从这里开始,我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况: -即,设备可以透明地使用任何应用程序内存区域。 - -分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来 -看,程序中的所有内存对象并不平等,这使得依赖于广泛的库的大型程序变得复杂。 - -具体来说,这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存(malloc、mmap -私有、mmap 共享)和通过设备驱动程序 API 分配的内存之间复制对象(这仍然以 mmap 结束, -但是是设备文件)。 - -对于平面数据集(数组、网格、图像……),这并不难实现,但对于复杂数据集(列表、树……), -很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错, -而且由于数据集和地址的重复,程序更难调试。 - -分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据,因此每个库可能 -不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响,并因为各种内存 -拷贝而浪费资源。 - -复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出,并不是一个可行的选择。 -这将导致库入口点的组合爆炸。 - -最后,随着高级语言结构(在 C++ 中,当然也在其他语言中)的进步,编译器现在有可能在没有程 -序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有 -其他模式,使用共享地址空间也更合理。 - - -I/O 总线、设备内存特性 -====================== - -由于一些限制,I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本 -内存访问;甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下,它 -不是缓存一致的。 - -如果我们只考虑 PCIE 总线,那么设备可以访问主内存(通常通过 IOMMU)并与 CPU 缓存一 -致。但是,它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟:CPU -只能访问有限范围的设备内存,而不能对其执行原子操作。因此,从内核的角度来看,设备内存不 -能被视为与常规内存等同。 - -另一个严重的因素是带宽有限(约 32GBytes/s,PCIE 4.0 和 16 通道)。这比最快的 GPU -内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自 -己的内存时高一个数量级。 - -一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制 -(OpenCAPI、CCIX)。它们主要允许 CPU 和设备之间的双向缓存一致性,并允许架构支持的所 -有原子操作。遗憾的是,并非所有平台都遵循这一趋势,并且一些主要架构没有针对这些问题的硬 -件解决方案。 - -因此,为了使共享地址空间有意义,我们不仅必须允许设备访问任何内存,而且还必须允许任何内 -存在设备使用时迁移到设备内存(在迁移时阻止 CPU 访问)。 - - -共享地址空间和迁移 -================== - -HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间,因此对 -于进程地址空间中的任何有效主内存地址,相同的地址指向相同的物理内存。 - -为了实现这一点,HMM 提供了一组帮助程序来填充设备页表,同时跟踪 CPU 页表更新。设备页表 -更新不像 CPU 页表更新那么容易。要更新设备页表,您必须分配一个缓冲区(或使用预先分配的 -缓冲区池)并在其中写入 GPU 特定命令以执行更新(取消映射、缓存失效和刷新等)。这不能通 -过所有设备的通用代码来完成。因此,为什么HMM提供了帮助器,在把硬件的具体细节留给设备驱 -动程序的同时,把一切可以考虑的因素都考虑进去了。 - -HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存,它允许为设备内存的每个页面分配一个 -struct page。这些页面很特殊,因为 CPU 无法映射它们。然而,它们允许使用现有的迁移机 -制将主内存迁移到设备内存,从 CPU 的角度来看,一切看起来都像是换出到磁盘的页面。使用 -struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次,HMM 仅提供帮助程序, -首先为设备内存热插拔新的 ZONE_DEVICE 内存,然后执行迁移。迁移内容和时间的策略决定留 -给设备驱动程序。 - -请注意,任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如,当支持给定CPU -地址 A 的页面从主内存页面迁移到设备页面时,对地址 A 的任何 CPU 访问都会触发缺页异常 -并启动向主内存的迁移。 - -凭借这两个特性,HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步,而且还通 -过迁移设备正在使用的数据集部分来利用设备内存。 - - -地址空间镜像实现和API -===================== - -地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中;HMM 有助于 -保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier -开始:: - - int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, - struct mm_struct *mm, unsigned long start, - unsigned long length, - const struct mmu_interval_notifier_ops *ops); - -在 ops->invalidate() 回调期间,设备驱动程序必须对范围执行更新操作(将范围标记为只 -读,或完全取消映射等)。设备必须在驱动程序回调返回之前完成更新。 - -当设备驱动程序想要填充一个虚拟地址范围时,它可以使用:: - - int hmm_range_fault(struct hmm_range *range); - -如果请求写访问,它将在丢失或只读条目上触发缺页异常(见下文)。缺页异常使用通用的 mm 缺 -页异常代码路径,就像 CPU 缺页异常一样。 - -这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟 -范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。 - -在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面,以保 -持事物正确同步。使用模式是:: - - int driver_populate_range(...) - { - struct hmm_range range; - ... - - range.notifier = &interval_sub; - range.start = ...; - range.end = ...; - range.hmm_pfns = ...; - - if (!mmget_not_zero(interval_sub->notifier.mm)) - return -EFAULT; - - again: - range.notifier_seq = mmu_interval_read_begin(&interval_sub); - mmap_read_lock(mm); - ret = hmm_range_fault(&range); - if (ret) { - mmap_read_unlock(mm); - if (ret == -EBUSY) - goto again; - return ret; - } - mmap_read_unlock(mm); - - take_lock(driver->update); - if (mmu_interval_read_retry(&ni, range.notifier_seq) { - release_lock(driver->update); - goto again; - } - - /* Use pfns array content to update device page table, - * under the update lock */ - - release_lock(driver->update); - return 0; - } - -driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用 -mmu_interval_read_retry() 之前保持,以避免与并发 CPU 页表更新发生任何竞争。 - -利用 default_flags 和 pfn_flags_mask -==================================== - -hmm_range 结构有 2 个字段,default_flags 和 pfn_flags_mask,它们指定整个范围 -的故障或快照策略,而不必为 pfns 数组中的每个条目设置它们。 - -例如,如果设备驱动程序需要至少具有读取权限的范围的页面,它会设置:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = 0; - -并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。 - -现在假设驱动程序想要做同样的事情,除了它想要拥有写权限的范围内的一页。现在驱动程序设 -置:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = HMM_PFN_REQ_WRITE; - range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; - -有了这个,HMM 将在至少读取(即有效)的所有页面中异常,并且对于地址 -== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限,即,如果 -CPU pte 没有设置写权限,那么HMM将调用handle_mm_fault()。 - -hmm_range_fault 完成后,标志位被设置为页表的当前状态,即 HMM_PFN_VALID | 如果页 -面可写,将设置 HMM_PFN_WRITE。 - - -从核心内核的角度表示和管理设备内存 -================================== - -尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存 -的信息,HMM 将自身挂接到 mm 代码的各个位置,以处理对设备内存支持的地址的任何访问。 -事实证明,这最终复制了 struct page 的大部分字段,并且还需要更新许多内核代码路径才 -能理解这种新的内存类型。 - -大多数内核代码路径从不尝试访问页面后面的内存,而只关心struct page的内容。正因为如此, -HMM 切换到直接使用 struct page 用于设备内存,这使得大多数内核代码路径不知道差异。 -我们只需要确保没有人试图从 CPU 端映射这些页面。 - -移入和移出设备内存 -================== - -由于 CPU 无法直接访问设备内存,因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存 -储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和 -migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。 - -在将页面迁移到设备私有内存之前,需要创建特殊的设备私有 ``struct page`` 。这些将用 -作特殊的“交换”页表条目,以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。 - -这些可以通过以下方式分配和释放:: - - struct resource *res; - struct dev_pagemap pagemap; - - res = request_free_mem_region(&iomem_resource, /* number of bytes */, - "name of driver resource"); - pagemap.type = MEMORY_DEVICE_PRIVATE; - pagemap.range.start = res->start; - pagemap.range.end = res->end; - pagemap.nr_range = 1; - pagemap.ops = &device_devmem_ops; - memremap_pages(&pagemap, numa_node_id()); - - memunmap_pages(&pagemap); - release_mem_region(pagemap.range.start, range_len(&pagemap.range)); - -还有devm_request_free_mem_region(), devm_memremap_pages(), -devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``. - -整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration `) , -但这些步骤分为设备驱动程序特定代码和共享公共代码: - -1. ``mmap_read_lock()`` - - 设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup(), - 因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。 - -2. ``migrate_vma_setup(struct migrate_vma *args)`` - - 设备驱动初始化了 ``struct migrate_vma`` 的字段,并将该指针传递给 - migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。 - 例如,设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存,设置 - ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页 - 面。如果后者被设置, ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备 - 私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前,只有匿名的私有VMA - 范围可以被迁移到系统内存和设备私有内存。 - - migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()`` - 和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表,使 - 其他设备的MMU无效,以便在 ``args->src`` 数组中填写要迁移的PFN。 - ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` , - 其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE, ``owner`` 字段设置为传递给 - migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无 - 效化回调,只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。 - - - 在遍历页表时,一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效 - 的 “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清 - 除它,而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被 - ``lock_page()``锁定,与LRU隔离(如果系统内存和设备私有页不在LRU上),从进 - 程中取消映射,并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup() - 还清除了 ``args->dst`` 数组。 - -3. 设备驱动程序分配目标页面并将源页面复制到目标页面。 - - 驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已 - 设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选 - 择跳过页面迁移。 - - 然后,驱动程序分配一个设备私有 struct page 或一个系统内存页,用 ``lock_page()`` - 锁定该页,并将 ``dst`` 数组条目填入:: - - dst[i] = migrate_pfn(page_to_pfn(dpage)); - - 现在驱动程序知道这个页面正在被迁移,它可以使设备私有 MMU 映射无效并将设备私有 - 内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失 - 效,因此设备驱动程序只需使其自己的 MMU 映射失效。 - - 驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的 - ``struct page`` 面,并将源页面复制到目标设备上,如果指针为 ``NULL`` ,意 - 味着源页面没有被填充到系统内存中,则清除目标设备的私有内存。 - -4. ``migrate_vma_pages()`` - - 这一步是实际“提交”迁移的地方。 - - 如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页,这时新分配的页会被插 - 入到CPU的页表中。如果一个CPU线程在同一页面上发生异常,这可能会失败。然而,页 - 表被锁定,只有一个新页会被插入。如果它失去了竞争,设备驱动将看到 - ``MIGRATE_PFN_MIGRATE`` 位被清除。 - - 如果源页被锁定、隔离等,源 ``struct page`` 信息现在被复制到目标 - ``struct page`` ,最终完成CPU端的迁移。 - -5. 设备驱动为仍在迁移的页面更新设备MMU页表,回滚未迁移的页面。 - - 如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置,设备驱动可以 - 更新设备MMU,如果 ``MIGRATE_PFN_WRITE`` 位被设置,则设置写启用位。 - -6. ``migrate_vma_finalize()`` - - 这一步用新页的页表项替换特殊的迁移页表项,并释放对源和目的 ``struct page`` - 的引用。 - -7. ``mmap_read_unlock()`` - - 现在可以释放锁了。 - -独占访问存储器 -============== - -一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一 -个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU -的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一 -个内存范围不能从用户空间访问。 - -这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会 -导致一个异常,该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已 -经被MMU通知器改变,之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序 -放弃页面锁和页面引用为止,这时页面上的任何CPU异常都可以按所述进行。 - -内存 cgroup (memcg) 和 rss 统计 -=============================== - -目前,设备内存被视为 rss 计数器中的任何常规页面(如果设备页面用于匿名,则为匿名, -如果设备页面用于文件支持页面,则为文件,如果设备页面用于共享内存,则为 shmem)。 -这是为了保持现有应用程序的故意选择,这些应用程序可能在不知情的情况下开始使用设备 -内存,运行不受影响。 - -一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序, -因此不会释放太多系统内存。在决定以不同方式计算设备内存之前,我们希望收集更多关 -于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。 - -对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算,常规 -页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规 -内存不会失败,因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对 -内存资源控制的影响有了更多的了解,我们可能会在后面重新考虑这个选择。 - -请注意,设备内存永远不能由设备驱动程序或通过 GUP 固定,因此此类内存在进程退出时 -总是被释放的。或者在共享内存或文件支持内存的情况下,当删除最后一个引用时。 diff --git a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst deleted file mode 100644 index c6d471ce2131..000000000000 --- a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst +++ /dev/null @@ -1,436 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/hugetlbfs_reserv.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -============== -Hugetlbfs 预留 -============== - -概述 -==== - -:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指 -示要使用巨页,这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常 -时没有巨页存在,任务就会被发送一个SIGBUS,并经常不高兴地死去。在加入巨页支 -持后不久,人们决定,在mmap()时检测巨页的短缺情况会更好。这个想法是,如果 -没有足够的巨页来覆盖映射,mmap()将失败。这首先是在mmap()时在代码中做一个 -简单的检查,以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一 -样,代码随着时间的推移而不断发展。然而,基本的想法是在mmap()时 “预留” -巨页,以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核 -中是如何进行巨页预留处理的。 - - -读者 -==== -这个描述主要是针对正在修改hugetlbfs代码的内核开发者。 - - -数据结构 -======== - -resv_huge_pages - 这是一个全局的(per-hstate)预留的巨页的计数。预留的巨页只对预留它们的任 - 务可用。因此,一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。 -Reserve Map - 预留映射由以下结构体描述:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - 系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的 - 区域。一个区域被描述为:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型,在 - reserv_map 中的一个区域可能表示该范围存在预留,或预留不存在。 -Flags for MAP_PRIVATE Reservations - 这些被存储在预留的映射指针的底部。 - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - 表示该任务是与该映射相关的预留的所有者。 - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - 表示最初映射此范围(并创建储备)的任务由于COW失败而从该任务(子任务)中取消映 - 射了一个页面。 -Page Flags - PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在 - “释放巨页” 一节中讨论。 - - -预留映射位置(私有或共享) -========================== - -一个巨页映射或段要么是私有的,要么是共享的。如果是私有的,它通常只对一个地址空间 -(任务)可用。如果是共享的,它可以被映射到多个地址空间(任务)。对于这两种类型的映射, -预留映射的位置和语义是明显不同的。位置的差异是: - -- 对于私有映射,预留映射挂在VMA结构体上。具体来说,就是vma->vm_private_data。这个保 - 留映射是在创建映射(mmap(MAP_PRIVATE))时创建的。 -- 对于共享映射,预留映射挂在inode上。具体来说,就是inode->i_mapping->private_data。 - 由于共享映射总是由hugetlbfs文件系统中的文件支持,hugetlbfs代码确保每个节点包含一个预 - 留映射。因此,预留映射在创建节点时被分配。 - - -创建预留 -======== -当创建一个巨大的有页面支持的共享内存段(shmget(SHM_HUGETLB))或通过mmap(MAP_HUGETLB) -创建一个映射时,就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用:: - - int hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) - -hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE -标志。如果指定了NORESERVE,那么这个函数立即返回,因为不需要预留。 - -参数'from'和'to'是映射或基础文件的巨页索引。对于shmget(),'from'总是0,'to'对应于段/映射 -的长度。对于mmap(),offset参数可以用来指定进入底层文件的偏移量。在这种情况下,'from'和'to' -参数已经被这个偏移量所调整。 - -PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。 - -- 对于共享映射,预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时,预留映射不被 - 修改。 -- 对于私有映射,预留映射中没有条目表示相应页面存在预留。随着预留被消耗,条目被添加到预留映射中。 - 因此,预留映射也可用于确定哪些预留已被消耗。 - -对于私有映射,hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外, -HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 - -预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射,这始终是一个值(to - from)。 -然而,对于共享映射来说,一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节, -请参见 :ref:`预留映射的修改 ` 一节。 - -该映射可能与一个子池(subpool)相关联。如果是这样,将查询子池以确保有足够的空间用于映射。子池 -有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 ` -一节。 - -在咨询了预留映射和子池之后,就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查 -并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而,在这 -些函数中,代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话,全局预留计数resv_huge_pages -会被调整,如下所示:: - - if (resv_needed <= (resv_huge_pages - free_huge_pages)) - resv_huge_pages += resv_needed; - -注意,在检查和调整这些计数器时,全局锁hugetlb_lock会被预留。 - -如果有足够的空闲的巨页,并且全局计数resv_huge_pages被调整,那么与映射相关的预留映射被修改以 -反映预留。在共享映射的情况下,将存在一个file_region,包括'from'-'to'范围。对于私有映射, -不对预留映射进行修改,因为没有条目表示存在预留。 - -如果hugetlb_reserve_pages()成功,全局预留数和与映射相关的预留映射将根据需要被修改,以确保 -在'from'-'to'范围内存在预留。 - -消耗预留/分配一个巨页 -=========================== - -当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() -中进行的:: - - struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) - -alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 -此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 -预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 -外拷贝被分配。 - - -调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详 -细内容,请参见 :ref:`预留映射帮助函数 ` 一节。从 -vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留,则为0,如果不存在预留,则为1。 -如果不存在预留,并且有一个与映射相关联的子池,则查询子池以确定它是否包含预留。如果子池包含预留, -则可将其中一个用于该分配。然而,在任何情况下,avoid_reserve参数都会优先考虑为分配使用预留。在 -确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 -的参数: - -- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 -- chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, - 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 - 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 - -与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面,当该页面从空闲列表中移除时, -free_huge_pages的值被递减。如果有一个与该页相关的预留,将进行以下调整:: - - SetPagePrivate(page); /* 表示分配这个页面消耗了一个预留, - * 如果遇到错误,以至于必须释放这个页面,预留将被 - * 恢复。 */ - resv_huge_pages--; /* 减少全局预留计数 */ - -注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 -的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: -SetPagePrivate(page) 和 resv_huge_pages--. - -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 -面被释放时,这将被用于子池的计数。 - -然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 -到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 -已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建一 -个新的条目。 - -注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 -的剩余巨页和过度分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整。 -SetPagePrivate(page)和resv_huge_pages-。 - -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 -面被释放时,这将被用于子池的计数。 - -然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 -到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 -已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 -一个新的条目。 - -在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 -vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 -享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 -这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 -识别。如果检测到这种竞争,子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息,请 -参见 :ref:`预留映射帮助函数 ` 一节。 - - -实例化巨页 -========== - -在巨页分配之后,页面通常被添加到分配任务的页表中。在此之前,共享映射中的页面被添加到页面缓 -存中,私有映射中的页面被添加到匿名反向映射中。在这两种情况下,PagePrivate标志被清除。因此, -当一个已经实例化的巨页被释放时,不会对全局预留计数(resv_huge_pages)进行调整。 - - -释放巨页 -======== - -巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 -递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保 -留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。 - -page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置,它表明全局预留计数 -应该被调整(关于如何设置这些标志的信息,请参见 -:ref: `消耗预留/分配一个巨页 ` )。 - - -该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值(不等于 -传递的1的值),它表明预留与子池相关联,这个新释放的页面必须被用来保持子池预留的数量超过最小值。 -因此,在这种情况下,全局resv_huge_pages计数器被递增。 - -如果页面中设置了PagePrivate标志,那么全局resv_huge_pages计数器将永远被递增。 - -子池预留 -======== - -有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一 -个hstate中的页面子集,它与一个已挂载的hugetlbfs文件系统相关 - -当一个hugetlbfs文件系统被挂载时,可以指定min_size选项,它表示文件系统所需的最小的巨页数量。 -如果指定了这个选项,与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体 -hugepage_subpool的min_hpages字段中被跟踪。在挂载时,hugetlb_acct_memory(min_hpages) -被调用以预留指定数量的巨页。如果它们不能被预留,挂载就会失败。 - -当从子池中获取或释放页面时,会调用hugepage_subpool_get/put_pages()函数。 -hugepage_subpool_get/put_pages被传递给巨页数量,以此来调整子池的 “已用页面” 计数 -(get为下降,put为上升)。通常情况下,如果子池中没有足够的页面,它们会返回与传递的相同的值或 -一个错误。 - -然而,如果预留与子池相关联,可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局 -池调整的数量。例如,假设一个子池包含3个预留的巨页,有人要求5个。与子池相关的3个预留页可以用来 -满足部分请求。但是,必须从全局池中获得2个页面。为了向调用者转达这一信息,将返回值2。然后,调用 -者要负责从全局池中获取另外两个页面。 - - -COW和预留 -========== - -由于共享映射都指向并使用相同的底层页面,COW最大的预留问题是私有映射。在这种情况下,两个任务可 -以指向同一个先前分配的页面。一个任务试图写到该页,所以必须分配一个新的页,以便每个任务都指向它 -自己的页。 - -当该页最初被分配时,该页的预留被消耗了。当由于COW而试图分配一个新的页面时,有可能没有空闲的巨 -页,分配会失败。 - -当最初创建私有映射时,通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。 -由于所有者创建了映射,所有者拥有与映射相关的所有预留。因此,当一个写异常发生并且没有可用的页面 -时,对预留的所有者和非所有者采取不同的行动。 - -在发生异常的任务不是所有者的情况下,异常将失败,该任务通常会收到一个SIGBUS。 - -如果所有者是发生异常的任务,我们希望它能够成功,因为它拥有原始的预留。为了达到这个目的,该页被 -从非所有者任务中解映射出来。这样一来,唯一的引用就是来自拥有者的任务。此外,HPAGE_RESV_UNMAPPED -位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常,它可能 -会收到一个SIGBUS。但是,映射/预留的原始拥有者的行为将与预期一致。 - -预留映射的修改 -============== - -以下低级函数用于对预留映射进行修改。通常情况下,这些函数不会被直接调用。而是调用一个预留映射辅 -助函数,该函数调用这些低级函数中的一个。这些低级函数在源代码(mm/hugetlb.c)中得到了相当好的 -记录。这些函数是:: - - long region_chg(struct resv_map *resv, long f, long t); - long region_add(struct resv_map *resv, long f, long t); - void region_abort(struct resv_map *resv, long f, long t); - long region_count(struct resv_map *resv, long f, long t); - -在预留映射上的操作通常涉及两个操作: - -1) region_chg()被调用来检查预留映射,并确定在指定的范围[f, t]内有多少页目前没有被代表。 - - 调用代码执行全局检查和分配,以确定是否有足够的巨页使操作成功。 - -2) - a) 如果操作能够成功,regi_add()将被调用,以实际修改先前传递给regi_chg()的相同范围 - [f, t]的预留映射。 - b) 如果操作不能成功,region_abort被调用,在相同的范围[f, t]内中止操作。 - -注意,这是一个两步的过程, region_add()和 region_abort()在事先调用 region_chg()后保证 -成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作(特别是 region_add())的 -成功。 - -如上所述,region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加 -到映射中的范围内的页数。在大多数情况下, region_add() 的返回值与 region_chg() 的返回值相 -同。然而,在共享映射的情况下,有可能在调用 region_chg() 和 region_add() 之间对预留映射进 -行更改。在这种情况下,regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下,全局计数 -和子池计数很可能是不正确的,需要调整。检查这种情况并进行适当的调整是调用者的责任。 - -函数region_del()被调用以从预留映射中移除区域。 -它通常在以下情况下被调用: - -- 当hugetlbfs文件系统中的一个文件被删除时,该节点将被释放,预留映射也被释放。在释放预留映射 - 之前,所有单独的file_region结构体必须被释放。在这种情况下,region_del的范围是[0, LONG_MAX]。 -- 当一个hugetlbfs文件正在被截断时。在这种情况下,所有在新文件大小之后分配的页面必须被释放。 - 此外,预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下,region_del - 的范围是[new_end_of_file, LONG_MAX]。 -- 当在一个hugetlbfs文件中打洞时。在这种情况下,巨页被一次次从文件的中间移除。当这些页被移除 - 时,region_del()被调用以从预留映射中移除相应的条目。在这种情况下,region_del被传递的范 - 围是[page_idx, page_idx + 1]。 - -在任何情况下,region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下,region_del() -会失败。这只能发生在打洞的情况下,即它必须分割一个现有的file_region条目,而不能分配一个新的 -结构体。在这种错误情况下,region_del()将返回-ENOMEM。这里的问题是,预留映射将显示对该页有 -预留。然而,子池和全局预留计数将不反映该预留。为了处理这种情况,调用函数hugetlb_fix_reserve_counts() -来调整计数器,使其与不能被删除的预留映射条目相对应。 - -region_count()在解除私有巨页映射时被调用。在私有映射中,预留映射中没有条目表明存在一个预留。 -因此,通过计算预留映射中的条目数,我们知道有多少预留被消耗了,有多少预留是未完成的 -(Outstanding = (end - start) - region_count(resv, start, end))。由于映射正在消 -失,子池和全局预留计数被未完成的预留数量所减去。 - -预留映射帮助函数 -================ - -有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣,所以它们只是传入一个 -地址而不是一个范围。此外,它们还传入相关的VMA。从VMA中,可以确定映射的类型(私有或共享)和预留 -映射的位置(inode或VMA)。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而, -它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义,并向调用者隐藏了这个细节:: - - long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -该函数为指定的页面调用 region_chg()。如果不存在预留,则返回1。如果存在预留,则返回0:: - - long vma_commit_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这将调用 region_add(),用于指定的页面。与region_chg和region_add的情况一样,该函数应在 -先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加,它将 -返回1,如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出 -现意外的差异,说明在两次调用之间修改了预留映射:: - - void vma_end_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样,该函数应在 -先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作:: - - long vma_add_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这是一个特殊的包装函数,有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数 -中调用。该函数与vma_needs_reservation一起使用,试图将一个预留添加到预留映射中。它考虑到 -了私有和共享映射的不同预留映射语义。因此,region_add被调用用于共享映射(因为映射中的条目表 -示预留),而region_del被调用用于私有映射(因为映射中没有条目表示预留)。关于在错误路径上需 -要做什么的更多信息,请参见 “错误路径中的预留清理” 。 - - -错误路径中的预留清理 -==================== - -正如在:ref:`预留映射帮助函数` 一节中提到的,预留的修改分两步进行。首 -先,在分配页面之前调用vma_needs_reservation。如果分配成功,则调用vma_commit_reservation。 -如果不是,则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整, -一切都很好。 - -此外,在一个巨页被实例化后,PagePrivate标志被清空,这样,当页面最终被释放时,计数是 -正确的。 - -然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, -页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 -(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 -显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, -并阻止分配一个预先分配的页面。 - -函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的 -是将预留映射恢复到页面分配前的状态。通过这种方式,预留映射的状态将与页面释放后的全局预留计 -数相对应。 - -函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下, -它将简单地清除该页的PagePrivate标志。这样一来,当页面被释放时,全局预留计数将不会被递增。 -然而,预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址,但它不会像最 -初设想的那样使用一个预留页。 - -有一些代码(最明显的是userfaultfd)不能调用restore_reserve_on_error。在这种情况下, -它简单地修改了PagePrivate,以便在释放巨页时不会泄露预留。 - - -预留和内存策略 -============== -当git第一次被用来管理Linux代码时,每个节点的巨页列表就存在于hstate结构中。预留的概念是 -在一段时间后加入的。当预留被添加时,没有尝试将内存策略考虑在内。虽然cpusets与内存策略不 -完全相同,但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作 -用:: - - - /* - * 当cpuset被配置时,它打破了严格的hugetlb页面预留,因为计数是在一个全局变量上完 - * 成的。在有cpuset的情况下,这样的预留完全是垃圾,因为预留没有根据当前cpuset的 - * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时,应用程序仍然有可能 - * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的(或者说太难看了),因 - * 为cpuset太不稳定了,任务或内存节点可以在cpuset之间动态移动。与cpuset共享 - * hugetlb映射的语义变化是不可取的。然而,为了预留一些语义,我们退回到检查当前空闲 - * 页的可用性,作为一种最好的尝试,希望能将cpuset改变语义的影响降到最低。 - */ - -添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败(OOM)。然而,如果一个应用 -程序使用cpusets或内存策略,就不能保证在所需的节点上有巨页可用。即使有足够数量的全局 -预留,也是如此。 - -Hugetlbfs回归测试 -================= - -最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码,请使用 -libhugetlbfs测试套件来检查回归情况。此外,如果你添加了任何新的hugetlb功能,请在 -libhugetlbfs中添加适当的测试。 - --- -Mike Kravetz,2017年4月7日 diff --git a/Documentation/translations/zh_CN/vm/hwpoison.rst b/Documentation/translations/zh_CN/vm/hwpoison.rst deleted file mode 100644 index c6e1e7bdb05b..000000000000 --- a/Documentation/translations/zh_CN/vm/hwpoison.rst +++ /dev/null @@ -1,166 +0,0 @@ - -:Original: Documentation/vm/hwpoison.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -hwpoison -======== - -什么是hwpoison? -=============== - - -即将推出的英特尔CPU支持从一些内存错误中恢复( ``MCA恢复`` )。这需要操作系统宣布 -一个页面"poisoned",杀死与之相关的进程,并避免在未来使用它。 - -这个补丁包在虚拟机中实现了必要的(编程)框架。 - -引用概述中的评论:: - - 高级机器的检查与处理。处理方法是损坏的页面被硬件报告,通常是由于2位ECC内 - 存或高速缓存故障。 - - 这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时,当前运行的进程 - 可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理,就可 - 以安全地忽略它. 而不是用另外一个机器检查去处理它。 - - 处理不同状态的页面缓存页。这里棘手的部分是,相对于其他虚拟内存用户, 我们可以异 - 步访问任何页面。因为内存故障可能随时随地发生,可能违反了他们的一些假设。这就是 - 为什么这段代码必须非常小心。一般来说,它试图使用正常的锁规则,如获得标准锁,即使 - 这意味着错误处理可能需要很长的时间。 - - 这里的一些操作有点低效,并且具有非线性的算法复杂性,因为数据结构没有针对这种情 - 况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的,所 - 以我们希望我们可以摆脱这种情况。 - -该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的 -各种检查组成,用来处理poison的页面。 - -现在主要目标是KVM客户机,但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm -版本。 - -对于KVM的使用,需要一个新的信号类型,这样KVM就可以用适当的地址将机器检查注入到客户 -机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是,所有的应用程序都不要这 -样做,但一些非常专业的应用程序可能会这样做。 - -故障恢复模式 -============ - -有两种(实际上是三种)模式的内存故障恢复可以在。 - -vm.memory_failure_recovery sysctl 置零: - 所有的内存故障都会导致panic。请不要尝试恢复。 - -早期处理 - (可以在全局和每个进程中控制) 一旦检测到错误,立即向应用程序发送SIGBUS这允许 - 应用程序以温和的方式处理内存错误(例如,放弃受影响的对象) 这是KVM qemu使用的 - 模式。 - -推迟处理 - 当应用程序运行到损坏的页面时,发送SIGBUS。这对不知道内存错误的应用程序来说是 - 最好的,默认情况下注意一些页面总是被当作late kill处理。 - -用户控制 -======== - -vm.memory_failure_recovery - 参阅 sysctl.txt - -vm.memory_failure_early_kill - 全局启用early kill - -PR_MCE_KILL - 设置early/late kill mode/revert 到系统默认值。 - - arg1: PR_MCE_KILL_CLEAR: - 恢复到系统默认值 - arg1: PR_MCE_KILL_SET: - arg2定义了线程特定模式 - - PR_MCE_KILL_EARLY: - Early kill - PR_MCE_KILL_LATE: - Late kill - PR_MCE_KILL_DEFAULT - 使用系统全局默认值 - - 注意,如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO),你应该在 - 指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则,SIGBUS将被发送到主线程。 - -PR_MCE_KILL_GET - 返回当前模式 - -测试 -==== - -* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面 - -* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块 - - corrupt-pfn - 在PFN处注入hwpoison故障,并echoed到这个文件。这做了一些早期过滤,以避 - 免在测试套件中损坏非预期页面。 - unpoison-pfn - 在PFN的Software-unpoison页面对应到这个文件。这样,一个页面可以再次被 - 复用。这只对Linux注入的故障起作用,对真正的内存故障不起作用。 - - 注意这些注入接口并不稳定,可能会在不同的内核版本中发生变化 - - corrupt-filter-dev-major, corrupt-filter-dev-minor - 只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通 - 配符值。这应该只用于人工注入的测试。 - - corrupt-filter-memcg - 限制注入到memgroup拥有的页面。由memcg的inode号指定。 - - Example:: - - mkdir /sys/fs/cgroup/mem/hwpoison - - usemem -m 100 -s 1000 & - echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks - - memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') - echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg - - page-types -p `pidof init` --hwpoison # shall do nothing - page-types -p `pidof usemem` --hwpoison # poison its pages - - corrupt-filter-flags-mask, corrupt-filter-flags-value - 当指定时,只有在((page_flags & mask) == value)的情况下才会poison页面。 - 这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相 - 同。这些标志位在include/linux/kernel-page-flags.h中定义,并在 - Documentation/admin-guide/mm/pagemap.rst中记录。 - -* 架构特定的MCE注入器 - - x86 有 mce-inject, mce-test - - 在mce-test中的一些便携式hwpoison测试程序,见下文。 - -引用 -==== - -http://halobates.de/mce-lc09-2.pdf - 09年LinuxCon的概述演讲 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git - 测试套件(在tsrc中的hwpoison特定可移植测试)。 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git - x86特定的注入器 - - -限制 -==== -- 不是所有的页面类型都被支持,而且永远不会。大多数内核内部对象不能被恢 - 复,目前只有LRU页。 - ---- -Andi Kleen, 2009年10月 diff --git a/Documentation/translations/zh_CN/vm/index.rst b/Documentation/translations/zh_CN/vm/index.rst deleted file mode 100644 index a1c6d529b6ff..000000000000 --- a/Documentation/translations/zh_CN/vm/index.rst +++ /dev/null @@ -1,54 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/index.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -================= -Linux内存管理文档 -================= - -这是一个关于Linux内存管理(mm)子系统内部的文档集,其中有不同层次的细节,包括注释 -和邮件列表的回复,用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建 -议,请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。 -对于控制和调整指南,请参阅(Documentation/admin-guide/mm/index)。 -TODO:待引用文档集被翻译完毕后请及时修改此处) - -.. toctree:: - :maxdepth: 1 - - active_mm - balance - damon/index - free_page_reporting - highmem - ksm - frontswap - hmm - hwpoison - hugetlbfs_reserv - memory-model - mmu_notifier - numa - overcommit-accounting - page_frags - page_owner - page_table_check - remap_file_pages - split_page_table_lock - z3fold - zsmalloc - -TODOLIST: -* arch_pgtable_helpers -* free_page_reporting -* hugetlbfs_reserv -* page_migration -* slub -* transhuge -* unevictable-lru -* vmalloced-kernel-stacks diff --git a/Documentation/translations/zh_CN/vm/ksm.rst b/Documentation/translations/zh_CN/vm/ksm.rst deleted file mode 100644 index 83b0c73984da..000000000000 --- a/Documentation/translations/zh_CN/vm/ksm.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/ksm.rst - -:翻译: - - 徐鑫 xu xin - -============ -内核同页合并 -============ - -KSM 是一种节省内存的数据去重功能,由CONFIG_KSM=y启用,并在2.6.32版本时被添加 -到Linux内核。详见 ``mm/ksm.c`` 的实现,以及http://lwn.net/Articles/306704和 -https://lwn.net/Articles/330589 - -KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst -文档中有描述。 - -设计 -==== - -概述 ----- - -概述内容请见mm/ksm.c文档中的“DOC: Overview” - -逆映射 ------- -KSM维护着稳定树中的KSM页的逆映射信息。 - -当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时,则代表了 -KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时,这个KSM页 -的 ``page->mapping`` 指向了该稳定树节点。 - -如果共享数超过了阈值,KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多 -个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息,其中 ``page->mapping`` -指向该"副本"。 - -每个链以及链接到该链中的所有"副本"强制不变的是,它们代表了相同的写保护内存 -内容,尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。 - -这样一来,相比与无限的逆映射链表,稳定树的查找计算复杂性不受影响。但在稳定树 -本身中不能有重复的KSM页面内容仍然是强制要求。 - -由 ``max_page_sharing`` 强制决定的数据去重限制是必要的,以此来避免虚拟内存 -rmap链表变得过大。rmap的遍历具有O(N)的复杂度,其中N是共享页面的rmap_项(即 -虚拟映射)的数量,而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。 -因此,这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进 -程在稳定节点"链"上的遍历也是O(N),但这个N是稳定树"副本"的数量,而不是rmap项 -的数量,因此它对ksmd性能没有显著影响。实际上,最佳稳定树"副本"的候选节点将 -保留在"副本"列表的开头。 - -``max_page_sharing`` 的值设置得高了会促使更快的内存合并(因为将有更少的稳定 -树副本排队进入稳定节点chain->hlist)和更高的数据去重系数,但代价是在交换、压 -缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。 - -``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控 -的影响,高比值可能意味着稳定节点dup中存在碎片,这可以通过在ksmd中引入碎片算 -法来解决,该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup,以便释放 -那些仅包含极少rmap项的稳定节点"dup",但这可能会增加ksmd进程的CPU使用率,并可 -能会减慢应用程序在KSM页面上的只读计算。 - -KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本",以便删减过时了的稳定节点。 -这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。 - -参考 -==== -内核代码请见mm/ksm.c。 -涉及的函数(mm_slot ksm_scan stable_node rmap_item)。 diff --git a/Documentation/translations/zh_CN/vm/memory-model.rst b/Documentation/translations/zh_CN/vm/memory-model.rst deleted file mode 100644 index 013e30c88d72..000000000000 --- a/Documentation/translations/zh_CN/vm/memory-model.rst +++ /dev/null @@ -1,135 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/memory-model.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -============ -物理内存模型 -============ - -系统中的物理内存可以用不同的方式进行寻址。最简单的情况是,物理内存从地址0开 -始,跨越一个连续的范围,直到最大的地址。然而,这个范围可能包含CPU无法访问的 -小孔隙。那么,在完全不同的地址可能有几个连续的范围。而且,别忘了NUMA,即不 -同的内存库连接到不同的CPU。 - -Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每 -个架构都定义了它所支持的内存模型,默认的内存模型是什么,以及是否有可能手动 -覆盖该默认值。 - -所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页 -帧的状态。 - -无论选择哪种内存模型,物理页框号(PFN)和相应的 `struct page` 之间都存 -在一对一的映射关系。 - -每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn` -帮助函数,允许从PFN到 `struct page` 的转换,反之亦然。 - -FLATMEM -======= - -最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的 -物理内存。 - -在FLATMEM内存模型中,有一个全局的 `mem_map` 数组来映射整个物理内存。对 -于大多数架构,孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page` -对象从未被完全初始化。 - -为了分配 `mem_map` 数组,架构特定的设置代码应该调用free_area_init()函数。 -然而,在调用memblock_free_all()函数之前,映射数组是不能使用的,该函数 -将所有的内存交给页分配器。 - -一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下,特 -定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。 - -使用FLATMEM,PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET` -是 `mem_map` 数组的一个索引。 - -`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。 - -SPARSEMEM -========= - -SPARSEMEM是Linux中最通用的内存模型,它是唯一支持若干高级功能的内存模型, -如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟 -初始化。 - -SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构 -体表示,它包含 `section_mem_map` ,从逻辑上讲,它是一个指向 `struct page` -阵列的指针。然而,它被存储在一些其他的magic中,以帮助分区管理。区段的大小 -和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量 -来指定的,这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS` -是一个架构所支持的物理地址的实际宽度,而 `SECTION_SIZE_BITS` 是一个任 -意的值。 - -最大的段数表示为 `NR_MEM_SECTIONS` ,定义为 - -.. math:: - - NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} - -`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的 -大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数: - -* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时, `mem_sections` 数组是静态的,有 - `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。 -* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时, `mem_sections` 数组被动态分配。 - 每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象,行数的计算是为了适应所有的 - 内存区。 - -架构设置代码应该调用sparse_init()来初始化内存区和内存映射。 - -通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和 - "sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的 - 值决定。 - -Classic sparse在page->flags中编码了一个页面的段号,并使用PFN的高位来访问映射该页 -框的段。在一个区段内,PFN是指向页数组的索引。 - -Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操 -作。有一个全局的 `struct page *vmemmap` 指针,指向一个虚拟连续的 `struct page` -对象阵列。PFN是该数组的一个索引,`struct page` 从 `vmemmap` 的偏移量是该页的PFN。 - -为了使用vmemmap,一个架构必须保留一个虚拟地址的范围,以映射包含内存映射的物理页,并 -确保 `vmemmap`指向该范围。此外,架构应该实现 :c:func:`vmemmap_populate` 方法, -它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求, -它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。 - -虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分 -配的存储中。这种存储用vmem_altmap结构表示,最终通过一长串的函数调用传递给 -vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和 -:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。 - -ZONE_DEVICE -=========== -`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上,为设备驱动识别的物理地址范 -围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下 -事实有关:这些地址范围的页面对象从未被在线标记过,而且必须对设备进行引用,而不仅仅 -是页面,以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ,通过 :c:func:`devm_memremap_pages` , -为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`, -:c:func:`page_to_pfn`, ,和 :c:func:`get_user_pages` 服务。由于页面引 -用计数永远不会低于1,所以页面永远不会被追踪为空闲内存,页面的 `struct list_head lru` -空间被重新利用,用于向映射该内存的主机设备/驱动程序进行反向引用。 - -虽然 `SPARSEMEM` 将内存作为一个区段的集合,可以选择收集并合成内存块,但 -`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE` -内存从未被在线标记,因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界 -上。这个实现依赖于这种缺乏用户接口的约束,允许子段大小的内存范围被指定给 -:c:func:`arch_add_memory` ,即内存热插拔的上半部分。子段支持允许2MB作为 -:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。 - -`ZONE_DEVICE` 的用户是: - -* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。 - -* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` , - 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见/vm/hmm.rst。 - -* p2pdma: 创建 `struct page` 对象,允许PCI/E拓扑结构中的peer设备协调它们之间的 - 直接DMA操作,即绕过主机内存。 diff --git a/Documentation/translations/zh_CN/vm/mmu_notifier.rst b/Documentation/translations/zh_CN/vm/mmu_notifier.rst deleted file mode 100644 index b29a37b33628..000000000000 --- a/Documentation/translations/zh_CN/vm/mmu_notifier.rst +++ /dev/null @@ -1,97 +0,0 @@ -:Original: Documentation/vm/mmu_notifier.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - - -什么时候需要页表锁内通知? -========================== - -当清除一个pte/pmd时,我们可以选择通过在页表锁下(通知版的\*_clear_flush调用 -mmu_notifier_invalidate_range)通知事件。但这种通知并不是在所有情况下都需要的。 - -对于二级TLB(非CPU TLB),如IOMMU TLB或设备TLB(当设备使用类似ATS/PASID的东西让 -IOMMU走CPU页表来访问进程的虚拟地址空间)。只有两种情况需要在清除pte/pmd时在持有页 -表锁的同时通知这些二级TLB: - - A) 在mmu_notifier_invalidate_range_end()之前,支持页的地址被释放。 - B) 一个页表项被更新以指向一个新的页面(COW,零页上的写异常,__replace_page(),...)。 - -情况A很明显,你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。 - -情况B更加微妙。为了正确起见,它需要按照以下序列发生: - - - 上页表锁 - - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify()) - - 设置页表项以指向新页 - -如果在设置新的pte/pmd值之前,清除页表项之后没有进行通知,那么你就会破坏设备的C11或 -C++11等内存模型。 - -考虑以下情况(设备使用类似于ATS/PASID的功能)。 - -两个地址addrA和addrB,这样|addrA - addrB| >= PAGE_SIZE,我们假设它们是COW的 -写保护(B的其他情况也适用)。 - -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {尝试写到addrA} - CPU-thread-1 {尝试写到addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {读取addrA并填充设备TLB} - DEV-thread-2 {读取addrB并填充设备TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] ------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {更新页表以指向addrA的新页}} - CPU-thread-1 {COW_step1: {更新页表以指向addrB的新页}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {写入addrA,这是对新页面的写入} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {写入addrB,这是一个写入新页的过程} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {从旧页中读取addrA} - DEV-thread-2 {从新页面读取addrB} - -所以在这里,因为在N+2的时候,清空页表项没有和通知一起作废二级TLB,设备在看到addrA的新值之前 -就看到了addrB的新值。这就破坏了设备的总内存序。 - -当改变一个pte的写保护或指向一个新的具有相同内容的写保护页(KSM)时,将mmu_notifier_invalidate_range -调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程 -在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占,也是如此。 diff --git a/Documentation/translations/zh_CN/vm/numa.rst b/Documentation/translations/zh_CN/vm/numa.rst deleted file mode 100644 index 6af412b924ad..000000000000 --- a/Documentation/translations/zh_CN/vm/numa.rst +++ /dev/null @@ -1,101 +0,0 @@ -:Original: Documentation/vm/numa.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -始于1999年11月,作者: - -========================== -何为非统一内存访问(NUMA)? -========================== - -这个问题可以从几个视角来回答:硬件观点和Linux软件视角。 - -从硬件角度看,NUMA系统是一个由多个组件或装配组成的计算机平台,每个组件可能包含0个或更多的CPU、 -本地内存和/或IO总线。为了简洁起见,并将这些物理组件/装配的硬件视角与软件抽象区分开来,我们在 -本文中称这些组件/装配为“单元”。 - -每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能 -不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如,交叉开关或点对点 -链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来,以创建NUMA平台,其中的单元与其 -他单元有多个距离。 - -对于Linux,感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中, -所有的内存都是可见的,并且可以从连接到任何单元的任何CPU中访问,缓存一致性是由处理器缓存和/或 -系统互连在硬件中处理。 - -内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元 -有多远。例如,连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和 -更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的(其他)单元。 - -平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反,这种架构是提供可扩展 -内存带宽的一种手段。然而,为了实现可扩展的内存带宽,系统和应用软件必须安排大部分的内存引用 -[cache misses]到“本地”内存——同一单元的内存,如果有的话——或者到最近的有内存的单元。 - -这就自然而然有了Linux软件对NUMA系统的视角: - -Linux将系统的硬件资源划分为多个软件抽象,称为“节点”。Linux将节点映射到硬件平台的物理单元 -上,对一些架构的细节进行了抽象。与物理单元一样,软件节点可能包含0或更多的CPU、内存和/或IO -总线。同样,对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快 -的访问时间和更高的有效带宽。 - -对于一些架构,如x86,Linux将“隐藏”任何代表没有内存连接的物理单元的节点,并将连接到该单元 -的任何CPU重新分配到代表有内存的单元的节点上。因此,在这些架构上,我们不能假设Linux将所有 -的CPU与一个给定的节点相关联,会看到相同的本地内存访问时间和带宽。 - -此外,对于某些架构,同样以x86为例,Linux支持对额外节点的仿真。对于NUMA仿真,Linux会将现 -有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部 -分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的,当与cpusets一起使用时, -可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst] - -对于每个有内存的节点,Linux构建了一个独立的内存管理子系统,有自己的空闲页列表、使用中页列表、 -使用统计和锁来调解访问。此外,Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE -中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求 -时要访问的区/节点。当一个区没有可用的内存来满足请求时,这种情况被称为“overflow 溢出”或 -“fallback 回退”。 - -由于一些节点包含多个包含不同类型内存的区,Linux必须决定是否对区列表进行排序,使分配回退到不同 -节点上的相同区类型,或同一节点上的不同区类型。这是一个重要的考虑因素,因为有些区,如DMA或DMA32, -代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距 -离排序的远程节点之前,它会尝试回退到同一节点的其他分区。 - -默认情况下,Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说,Linux将试 -图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能 -满足请求,内核将检查所选分区列表中其他节点的区域,寻找列表中第一个能满足请求的区域。 - -本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配 -一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构 -中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥 -远的调度域中。然而,调度器并没有直接考虑到任务的NUMA足迹。因此,在充分不平衡的情况下,任务可 -以在节点之间迁移,远离其初始节点和内核数据结构。 - -系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口,如taskset(1)和numactl(1),以及程 -序接口,如sched_setaffinity(2),来限制任务的迁移,以改善NUMA定位。此外,人们可以使用 -Linux NUMA内存策略修改内核的默认本地分配行为。 [见 -:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. - -系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点 -的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst] - -在不隐藏无内存节点的架构上,Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无 -内存的节点,“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反,它 -将是内核在建立分区列表时选择的离它最近的有内存的节点。所以,默认情况下,本地分配将由内核提供 -最近的可用内存来完成。这是同一机制的结果,该机制允许这种分配在一个包含内存的节点溢出时回退到 -其他附近的节点。 - -一些内核分配不希望或不能容忍这种分配回退行为。相反,他们想确保他们从指定的节点获得内存,或者 -得到通知说该节点没有空闲内存。例如,当一个子系统分配每个CPU的内存资源时,通常是这种情况。 - -一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的 -节点ID,然后只从返回的节点ID请求内存。当这样的分配失败时,请求的子系统可以恢复到它自己的回退 -路径。板块内核内存分配器就是这样的一个例子。或者,子系统可以选择在分配失败时禁用或不启用自己。 -内核分析子系统就是这样的一个例子。 - -如果架构支持——不隐藏无内存节点,那么连接到无内存节点的CPU将总是产生回退路径的开销,或者一些 -子系统如果试图完全从无内存的节点分配内存,将无法初始化。为了透明地支持这种架构,内核子系统可 -以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样,这是同 -一个节点,默认的本地页分配将从这个节点开始尝试。 diff --git a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst b/Documentation/translations/zh_CN/vm/overcommit-accounting.rst deleted file mode 100644 index 8765cb118f24..000000000000 --- a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst +++ /dev/null @@ -1,86 +0,0 @@ -:Original: Documentation/vm/overcommit-accounting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - - -============== -超量使用审计 -============== - -Linux内核支持下列超量使用处理模式 - -0 - 启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。 - 它确保严重的疯狂分配失败,同时允许超量使用以减少swap的使用。在这种模式下, - 允许root分配稍多的内存。这是默认的。 -1 - 总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码,只是依赖 - 几乎完全由零页组成的虚拟内存 - -2 - 不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量 - (默认为50%)。根据你使用的数量,在大多数情况下,这意味着一个进程在访问页面时 - 不会被杀死,但会在内存分配上收到相应的错误。 - - 对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说 - 是很有用的。 - -超量使用策略是通过sysctl `vm.overcommit_memory` 设置的。 - -可以通过 `vm.overcommit_ratio` (百分比)或 `vm.overcommit_kbytes` (绝对值) -来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。 - -在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前 -的超量使用和提交量。 - -陷阱 -==== - -C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证,并在接近边缘的地方运行, -你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说,这并 -不重要,但如果你真的非常关心的话,这就是一个值得关注的案例。 - - -在模式2中,MAP_NORESERVE标志被忽略。 - - -它是如何工作的 -============== - -超量使用是基于以下规则 - -对于文件映射 - | SHARED or READ-only - 0 cost (该文件是映射而不是交换) - | PRIVATE WRITABLE - 每个实例的映射大小 - -对于匿名或者 ``/dev/zero`` 映射 - | SHARED - 映射的大小 - | PRIVATE READ-only - 0 cost (但作用不大) - | PRIVATE WRITABLE - 每个实例的映射大小 - -额外的计数 - | 通过mmap制作可写副本的页面 - | 从同一池中提取的shmfs内存 - -状态 -==== - -* 我们核算mmap内存映射 -* 我们核算mprotect在提交中的变化 -* 我们核算mremap的大小变化 -* 我们的审计 brk -* 审计munmap -* 我们在/proc中报告commit 状态 -* 核对并检查分叉的情况 -* 审查堆栈处理/执行中的构建 -* 叙述SHMfs的情况 -* 实现实际限制的执行 - -待续 -==== -* ptrace 页计数(这很难)。 diff --git a/Documentation/translations/zh_CN/vm/page_frags.rst b/Documentation/translations/zh_CN/vm/page_frags.rst deleted file mode 100644 index ad27fed33634..000000000000 --- a/Documentation/translations/zh_CN/vm/page_frags.rst +++ /dev/null @@ -1,38 +0,0 @@ -:Original: Documentation/vm/page_frag.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -页面片段 -======== - -一个页面片段是一个任意长度的任意偏移的内存区域,它位于一个0或更高阶的复合页面中。 -该页中的多个碎片在该页的引用计数器中被单独计算。 - -page_frag函数,page_frag_alloc和page_frag_free,为页面片段提供了一个简单 -的分配框架。这被网络堆栈和网络设备驱动使用,以提供一个内存的支持区域,作为 -sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 - -为了使用页面片段API,需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点, -并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用, -这在分配时开销可能会很大。然而,由于这种缓存的性质,要求任何对缓存的调用都要受到每 -个CPU的限制,或者每个CPU的限制,并在执行碎片分配时强制禁止中断。 - -网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 -netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache -被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 -它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 -将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 - -许多网络设备驱动程序使用类似的方法来分配页面片段,但页面片段是在环或描述符级别上 -缓存的。为了实现这些情况,有必要提供一种拆解页面缓存的通用方法。出于这个原因, -__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。 -这样做的好处是,它允许清理被添加到一个页面的多个引用,以避免每次分配都调用 -get_page。 - -Alexander Duyck,2016年11月29日。 diff --git a/Documentation/translations/zh_CN/vm/page_owner.rst b/Documentation/translations/zh_CN/vm/page_owner.rst deleted file mode 100644 index 9e951fabba9d..000000000000 --- a/Documentation/translations/zh_CN/vm/page_owner.rst +++ /dev/null @@ -1,116 +0,0 @@ -:Original: Documentation/vm/page_owner.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -================================ -page owner: 跟踪谁分配的每个页面 -================================ - -概述 -==== - -page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。 -当分配发生时,有关分配的信息,如调用堆栈和页面的顺序被存储到每个页面的特定存储中。 -当我们需要了解所有页面的状态时,我们可以获得并分析这些信息。 - -尽管我们已经有了追踪页面分配/释放的tracepoint,但用它来分析谁分配的每个页面是 -相当复杂的。我们需要扩大跟踪缓冲区,以防止在用户空间程序启动前出现重叠。而且,启 -动的程序会不断地将跟踪缓冲区转出,供以后分析,这将会改变系统的行为,会产生更多的 -可能性,而不是仅仅保留在内存中,所以不利于调试。 - -页面所有者也可以用于各种目的。例如,可以通过每个页面的gfp标志信息获得精确的碎片 -统计。如果启用了page owner,它就已经实现并激活了。我们非常欢迎其他用途。 - -page owner在默认情况下是禁用的。所以,如果你想使用它,你需要在你的启动cmdline -中加入"page_owner=on"。如果内核是用page owner构建的,并且由于没有启用启动 -选项而在运行时禁用page owner,那么运行时的开销是很小的。如果在运行时禁用,它不 -需要内存来存储所有者信息,所以没有运行时内存开销。而且,页面所有者在页面分配器的 -热路径中只插入了两个不可能的分支,如果不启用,那么分配就会像没有页面所有者的内核 -一样进行。这两个不可能的分支应该不会影响到分配的性能,特别是在静态键跳转标签修补 -功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。 - -- 没有page owner:: - - text data bss dec hex filename - 48392 2333 644 51369 c8a9 mm/page_alloc.o - -- 有page owner:: - - text data bss dec hex filename - 48800 2445 644 51889 cab1 mm/page_alloc.o - 6662 108 29 6799 1a8f mm/page_owner.o - 1025 8 8 1041 411 mm/page_ext.o - -虽然总共增加了8KB的代码,但page_alloc.o增加了520字节,其中不到一半是在hotpath -中。构建带有page owner的内核,并在需要时打开它,将是调试内核内存问题的最佳选择。 - -有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这 -个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些,所以,在初始化 -之前,许多页面可以被分配,但它们没有所有者信息。为了解决这个问题,这些早期分配的 -页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息,但至 -少,我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上,有13343 -个早期分配的页面被捕捉和标记,尽管它们大部分是由结构页扩展功能分配的。总之,在这 -之后,没有任何页面处于未追踪状态。 - -使用方法 -======== - -1) 构建用户空间的帮助:: - - cd tools/vm - make page_owner_sort - -2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. - -3) 做你想调试的工作。 - -4) 分析来自页面所有者的信息:: - - cat /sys/kernel/debug/page_owner > page_owner_full.txt - ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - - ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值):: - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - ``page_owner_sort`` 工具忽略了 ``PFN`` 行,将剩余的行放在buf中,使用regexp提 - 取页序值,计算buf的次数和页数,最后根据参数进行排序。 - - 在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出:: - - XXX times, XXX pages: - Page allocated via order XXX, ... - // Detailed stack - - 默认情况下, ``page_owner_sort`` 是根据buf的时间来排序的。如果你想 - 按buf的页数排序,请使用-m参数。详细的参数是: - - 基本函数: - - Sort: - -a 按内存分配时间排序 - -m 按总内存排序 - -p 按pid排序。 - -P 按tgid排序。 - -r 按内存释放时间排序。 - -s 按堆栈跟踪排序。 - -t 按时间排序(默认)。 - - 其它函数: - - Cull: - -c 通过比较堆栈跟踪而不是总块来进行剔除。 - - Filter: - -f 过滤掉内存已被释放的块的信息。 diff --git a/Documentation/translations/zh_CN/vm/page_table_check.rst b/Documentation/translations/zh_CN/vm/page_table_check.rst deleted file mode 100644 index a29fc1b360e6..000000000000 --- a/Documentation/translations/zh_CN/vm/page_table_check.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/page_table_check.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -页表检查 -======== - -概述 -==== - -页表检查允许通过确保防止某些类型的内存损坏来强化内核。 - -当新的页面可以从用户空间访问时,页表检查通过将它们的页表项(PTEs PMD等)添加到页表中来执行额外 -的验证。 - -在检测到损坏的情况下,内核会被崩溃。页表检查有一个小的性能和内存开销。因此,它在默认情况下是禁用 -的,但是在额外的加固超过性能成本的系统上,可以选择启用。另外,由于页表检查是同步的,它可以帮助调 -试双映射内存损坏问题,在错误的映射发生时崩溃内核,而不是在内存损坏错误发生后内核崩溃。 - -双重映射检测逻辑 -================ - -+-------------------+-------------------+-------------------+------------------+ -| Current Mapping | New mapping | Permissions | Rule | -+===================+===================+===================+==================+ -| Anonymous | Anonymous | Read | Allow | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Anonymous | Read / Write | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Named | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Anonymous | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Named | Any | Allow | -+-------------------+-------------------+-------------------+------------------+ - -启用页表检查 -============ - -用以下方法构建内核: - -- PAGE_TABLE_CHECK=y - 注意,它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。 - -- 使用 "page_table_check=on" 内核参数启动。 - -可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核,以便在没有额外的内核参数的情况下获得页表 -支持。 diff --git a/Documentation/translations/zh_CN/vm/remap_file_pages.rst b/Documentation/translations/zh_CN/vm/remap_file_pages.rst deleted file mode 100644 index af6b7e28af23..000000000000 --- a/Documentation/translations/zh_CN/vm/remap_file_pages.rst +++ /dev/null @@ -1,32 +0,0 @@ -:Original: Documentation/vm/remap_file_pages.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -============================== -remap_file_pages()系统调用 -============================== - -remap_file_pages()系统调用被用来创建一个非线性映射,也就是说,在这个映射中, -文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好 -处是,前者不需要内核创建额外的VMA(虚拟内存区)数据结构。 - -支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码,包括热 -路径。另外,为了使非线性映射工作,内核需要一种方法来区分正常的页表项和带有文件 -偏移的项(pte_file)。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资 -源,特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。 - -幸运的是,在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS -实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。 -由于64位系统的广泛使用,这种使用情况已经不重要了。 - -syscall被废弃了,现在用一个模拟来代替它。仿真会创建新的VMA,而不是非线性映射。 -对于remap_file_pages()的少数用户来说,它的工作速度会变慢,但ABI被保留了。 - -仿真的一个副作用(除了性能之外)是,由于额外的VMA,用户可以更容易达到 -vm.max_map_count的限制。关于限制的更多细节,请参见DEFAULT_MAX_MAP_COUNT -的注释。 diff --git a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst b/Documentation/translations/zh_CN/vm/split_page_table_lock.rst deleted file mode 100644 index 50694d97c426..000000000000 --- a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst +++ /dev/null @@ -1,96 +0,0 @@ -:Original: Documentation/vm/split_page_table_lock.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -================================= -分页表锁(split page table lock) -================================= - -最初,mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方 -法导致了多线程应用程序的缺页异常可扩展性差,因为对锁的争夺很激烈。为了提高可扩 -展性,我们引入了分页表锁。 - -有了分页表锁,我们就有了单独的每张表锁来顺序化对表的访问。目前,我们对PTE和 -PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。 - -有一些辅助工具来锁定/解锁一个表和其他访问器函数: - - - pte_offset_map_lock() - 映射pte并获取PTE表锁,返回所取锁的指针; - - pte_unmap_unlock() - 解锁和解映射PTE表; - - pte_alloc_map_lock() - 如果需要的话,分配PTE表并获取锁,如果分配失败,返回已获取的锁的指针 - 或NULL; - - pte_lockptr() - 返回指向PTE表锁的指针; - - pmd_lock() - 取得PMD表锁,返回所取锁的指针。 - - pmd_lockptr() - 返回指向PMD表锁的指针; - -如果CONFIG_SPLIT_PTLOCK_CPUS(通常为4)小于或等于NR_CPUS,则在编译 -时启用PTE表的分页表锁。如果分页锁被禁用,所有的表都由mm->page_table_lock -来保护。 - -如果PMD表启用了分页锁,并且架构支持它,那么PMD表的分页锁就会被启用(见 -下文)。 - -Hugetlb 和分页表锁 -================== - -Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁,但不对PUD使用。 - -Hugetlb特定的辅助函数: - - - huge_pte_lock() - 对PMD_SIZE页面采取pmd分割锁,否则mm->page_table_lock; - - huge_pte_lockptr() - 返回指向表锁的指针。 - -架构对分页表锁的支持 -==================== - -没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor() -和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。 - -确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页 -面。这个区域与page->ptl共享存储。 - -PMD分页锁只有在你有两个以上的页表级别时才有意义。 - -启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调 -用pgtable_pmd_page_dtor()。 - -分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb() -中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先 -分配一些PMD。 - -一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。 - -注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必 -须正确处理。 - -page->ptl -========= - -page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struct page。它 -与page->private(以及union中的其他几个字段)共享存储。 - -为了避免增加struct page的大小并获得最佳性能,我们使用了一个技巧: - - - 如果spinlock_t适合于long,我们使用page->ptr作为spinlock,这样我们 - 就可以避免间接访问并节省一个缓存行。 - - 如果spinlock_t的大小大于long的大小,我们使用page->ptl作为spinlock_t - 的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的 - 情况下使用分页锁,但由于间接访问而多花了一个缓存行。 - -PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t -分配在pgtable_pmd_page_ctor()中。 - -请不要直接访问page->ptl - -使用适当的辅助函数。 diff --git a/Documentation/translations/zh_CN/vm/z3fold.rst b/Documentation/translations/zh_CN/vm/z3fold.rst deleted file mode 100644 index 57204aa08caa..000000000000 --- a/Documentation/translations/zh_CN/vm/z3fold.rst +++ /dev/null @@ -1,31 +0,0 @@ -:Original: Documentation/vm/z3fold.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -====== -z3fold -====== - -z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。 -它是zbud的衍生物,允许更高的压缩率,保持其前辈的简单性和确定性。 - -z3fold和zbud的主要区别是: - -* 与zbud不同的是,z3fold允许最大的PAGE_SIZE分配。 -* z3fold在其页面中最多可以容纳3个压缩页面 -* z3fold本身没有输出任何API,因此打算通过zpool的API来使用 - -为了保持确定性和简单性,z3fold,就像zbud一样,总是在每页存储一个整数的压缩页,但是 -它最多可以存储3页,不像zbud最多可以存储2页。因此压缩率达到2.7倍左右,而zbud的压缩 -率是1.7倍左右。 - -不像zbud(但也像zsmalloc),z3fold_alloc()那样不返回一个可重复引用的指针。相反,它 -返回一个无符号长句柄,它编码了被分配对象的实际位置。 - -保持有效的压缩率接近于zsmalloc,z3fold不依赖于MMU的启用,并提供更可预测的回收行 -为,这使得它更适合于小型和反应迅速的系统。 diff --git a/Documentation/translations/zh_CN/vm/zsmalloc.rst b/Documentation/translations/zh_CN/vm/zsmalloc.rst deleted file mode 100644 index 29e9c70a8eb6..000000000000 --- a/Documentation/translations/zh_CN/vm/zsmalloc.rst +++ /dev/null @@ -1,78 +0,0 @@ -:Original: Documentation/vm/zs_malloc.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -======== -zsmalloc -======== - -这个分配器是为与zram一起使用而设计的。因此,该分配器应该在低内存条件下工作良好。特别是, -它从未尝试过higher order页面的分配,这在内存压力下很可能会失败。另一方面,如果我们只 -是使用单(0-order)页,它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将 -占据整个页面。这是其前身(xvmalloc)的主要问题之一。 - -为了克服这些问题,zsmalloc分配了一堆0-order页面,并使用各种"struct page"字段将它 -们链接起来。这些链接的页面作为一个单一的higher order页面,即一个对象可以跨越0-order -页面的边界。代码将这些链接的页面作为一个实体,称为zspage。 - -为了简单起见,zsmalloc只能分配大小不超过PAGE_SIZE的对象,因为这满足了所有当前用户的 -要求(在最坏的情况下,页面是不可压缩的,因此以"原样"即未压缩的形式存储)。对于大于这 -个大小的分配请求,会返回失败(见zs_malloc)。 - -此外,zs_malloc()并不返回一个可重复引用的指针。相反,它返回一个不透明的句柄(无符号 -长),它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久 -映射,因为这在32位系统上会导致问题,因为内核空间映射的VA区域非常小。因此,在使用分配 -的内存之前,对象必须使用zs_map_object()进行映射以获得一个可用的指针,随后使用 -zs_unmap_object()解除映射。 - -stat -==== - -通过CONFIG_ZSMALLOC_STAT,我们可以通过 ``/sys/kernel/debug/zsmalloc/`` -看到zsmalloc内部信息。下面是一个统计输出的例子。:: - - # cat /sys/kernel/debug/zsmalloc/zram0/classes - - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage - ... - ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 - ... - ... - - -class - 索引 -size - zspage存储对象大小 -almost_empty - ZS_ALMOST_EMPTY zspage的数量(见下文)。 -almost_full - ZS_ALMOST_FULL zspage的数量(见下图) -obj_allocated - 已分配对象的数量 -obj_used - 分配给用户的对象的数量 -pages_used - 为该类分配的页数 -pages_per_zspage - 组成一个zspage的0-order页面的数量 - -当n <= N / f时,我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组,其中 - -* n = 已分配对象的数量 -* N = zspage可以存储的对象总数 -* f = fullness_threshold_frac(即,目前是4个) - -同样地,我们将zspage分配给: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N diff --git a/Documentation/translations/zh_TW/index.rst b/Documentation/translations/zh_TW/index.rst index e1ce9d8c06f8..e97d7d578751 100644 --- a/Documentation/translations/zh_TW/index.rst +++ b/Documentation/translations/zh_TW/index.rst @@ -128,7 +128,7 @@ TODOList: * security/index * sound/index * crypto/index -* vm/index +* mm/index * bpf/index * usb/index * PCI/index diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore deleted file mode 100644 index bc74f5643008..000000000000 --- a/Documentation/vm/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -page-types -slabinfo diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst deleted file mode 100644 index 6f8269c284ed..000000000000 --- a/Documentation/vm/active_mm.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _active_mm: - -========= -Active MM -========= - -:: - - List: linux-kernel - Subject: Re: active_mm - From: Linus Torvalds - Date: 1999-07-30 21:36:24 - - Cc'd to linux-kernel, because I don't write explanations all that often, - and when I do I feel better about more people reading them. - - On Fri, 30 Jul 1999, David Mosberger wrote: - > - > Is there a brief description someplace on how "mm" vs. "active_mm" in - > the task_struct are supposed to be used? (My apologies if this was - > discussed on the mailing lists---I just returned from vacation and - > wasn't able to follow linux-kernel for a while). - - Basically, the new setup is: - - - we have "real address spaces" and "anonymous address spaces". The - difference is that an anonymous address space doesn't care about the - user-level page tables at all, so when we do a context switch into an - anonymous address space we just leave the previous address space - active. - - The obvious use for a "anonymous address space" is any thread that - doesn't need any user mappings - all kernel threads basically fall into - this category, but even "real" threads can temporarily say that for - some amount of time they are not going to be interested in user space, - and that the scheduler might as well try to avoid wasting time on - switching the VM state around. Currently only the old-style bdflush - sync does that. - - - "tsk->mm" points to the "real address space". For an anonymous process, - tsk->mm will be NULL, for the logical reason that an anonymous process - really doesn't _have_ a real address space at all. - - - however, we obviously need to keep track of which address space we - "stole" for such an anonymous user. For that, we have "tsk->active_mm", - which shows what the currently active address space is. - - The rule is that for a process with a real address space (ie tsk->mm is - non-NULL) the active_mm obviously always has to be the same as the real - one. - - For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the - "borrowed" mm while the anonymous process is running. When the - anonymous process gets scheduled away, the borrowed address space is - returned and cleared. - - To support all that, the "struct mm_struct" now has two counters: a - "mm_users" counter that is how many "real address space users" there are, - and a "mm_count" counter that is the number of "lazy" users (ie anonymous - users) plus one if there are any real users. - - Usually there is at least one real user, but it could be that the real - user exited on another CPU while a lazy user was still active, so you do - actually get cases where you have a address space that is _only_ used by - lazy users. That is often a short-lived state, because once that thread - gets scheduled away in favour of a real thread, the "zombie" mm gets - released because "mm_count" becomes zero. - - Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any - more. "init_mm" should be considered just a "lazy context when no other - context is available", and in fact it is mainly used just at bootup when - no real VM has yet been created. So code that used to check - - if (current->mm == &init_mm) - - should generally just do - - if (!current->mm) - - instead (which makes more sense anyway - the test is basically one of "do - we have a user context", and is generally done by the page fault handler - and things like that). - - Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, - because it slightly changes the interfaces to accommodate the alpha (who - would have thought it, but the alpha actually ends up having one of the - ugliest context switch codes - unlike the other architectures where the MM - and register state is separate, the alpha PALcode joins the two, and you - need to switch both together). - - (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst deleted file mode 100644 index cbaee9e59241..000000000000 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ /dev/null @@ -1,260 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _arch_page_table_helpers: - -=============================== -Architecture Page Table Helpers -=============================== - -Generic MM expects architectures (with MMU) to provide helpers to create, access -and modify page table entries at various level for different memory functions. -These page table helpers need to conform to a common semantics across platforms. -Following tables describe the expected semantics which can also be tested during -boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug -test need to be in sync. - - -PTE Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pte_same | Tests whether both PTE entries are the same | -+---------------------------+--------------------------------------------------+ -| pte_bad | Tests a non-table mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_present | Tests a valid mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_young | Tests a young PTE | -+---------------------------+--------------------------------------------------+ -| pte_dirty | Tests a dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_write | Tests a writable PTE | -+---------------------------+--------------------------------------------------+ -| pte_special | Tests a special PTE | -+---------------------------+--------------------------------------------------+ -| pte_protnone | Tests a PROT_NONE PTE | -+---------------------------+--------------------------------------------------+ -| pte_devmap | Tests a ZONE_DEVICE mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_soft_dirty | Tests a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_soft_dirty | Tests a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkyoung | Creates a young PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkold | Creates an old PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkdirty | Creates a dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkclean | Creates a clean PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkwrite | Creates a writable PTE | -+---------------------------+--------------------------------------------------+ -| pte_wrprotect | Creates a write protected PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkspecial | Creates a special PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkdevmap | Creates a ZONE_DEVICE mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mksoft_dirty | Creates a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_clear_soft_dirty | Clears a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_mksoft_dirty | Creates a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_clear_soft_dirty | Clears a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mknotpresent | Invalidates a mapped PTE | -+---------------------------+--------------------------------------------------+ -| ptep_clear | Clears a PTE | -+---------------------------+--------------------------------------------------+ -| ptep_get_and_clear | Clears and returns PTE | -+---------------------------+--------------------------------------------------+ -| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | -+---------------------------+--------------------------------------------------+ -| ptep_test_and_clear_young | Clears young from a PTE | -+---------------------------+--------------------------------------------------+ -| ptep_set_wrprotect | Converts into a write protected PTE | -+---------------------------+--------------------------------------------------+ -| ptep_set_access_flags | Converts into a more permissive PTE | -+---------------------------+--------------------------------------------------+ - - -PMD Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pmd_same | Tests whether both PMD entries are the same | -+---------------------------+--------------------------------------------------+ -| pmd_bad | Tests a non-table mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_leaf | Tests a leaf mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_huge | Tests a HugeTLB mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | -+---------------------------+--------------------------------------------------+ -| pmd_present | Tests a valid mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_young | Tests a young PMD | -+---------------------------+--------------------------------------------------+ -| pmd_dirty | Tests a dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_write | Tests a writable PMD | -+---------------------------+--------------------------------------------------+ -| pmd_special | Tests a special PMD | -+---------------------------+--------------------------------------------------+ -| pmd_protnone | Tests a PROT_NONE PMD | -+---------------------------+--------------------------------------------------+ -| pmd_devmap | Tests a ZONE_DEVICE mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_soft_dirty | Tests a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_soft_dirty | Tests a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkyoung | Creates a young PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkold | Creates an old PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkdirty | Creates a dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkclean | Creates a clean PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkwrite | Creates a writable PMD | -+---------------------------+--------------------------------------------------+ -| pmd_wrprotect | Creates a write protected PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkspecial | Creates a special PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkdevmap | Creates a ZONE_DEVICE mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mksoft_dirty | Creates a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_clear_soft_dirty | Clears a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_mksoft_dirty | Creates a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkinvalid | Invalidates a mapped PMD [1] | -+---------------------------+--------------------------------------------------+ -| pmd_set_huge | Creates a PMD huge mapping | -+---------------------------+--------------------------------------------------+ -| pmd_clear_huge | Clears a PMD huge mapping | -+---------------------------+--------------------------------------------------+ -| pmdp_get_and_clear | Clears a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_get_and_clear_full | Clears a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_test_and_clear_young | Clears young from a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_set_wrprotect | Converts into a write protected PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_set_access_flags | Converts into a more permissive PMD | -+---------------------------+--------------------------------------------------+ - - -PUD Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pud_same | Tests whether both PUD entries are the same | -+---------------------------+--------------------------------------------------+ -| pud_bad | Tests a non-table mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_leaf | Tests a leaf mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_huge | Tests a HugeTLB mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_trans_huge | Tests a Transparent Huge Page (THP) at PUD | -+---------------------------+--------------------------------------------------+ -| pud_present | Tests a valid mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_young | Tests a young PUD | -+---------------------------+--------------------------------------------------+ -| pud_dirty | Tests a dirty PUD | -+---------------------------+--------------------------------------------------+ -| pud_write | Tests a writable PUD | -+---------------------------+--------------------------------------------------+ -| pud_devmap | Tests a ZONE_DEVICE mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkyoung | Creates a young PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkold | Creates an old PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkdirty | Creates a dirty PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkclean | Creates a clean PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkwrite | Creates a writable PUD | -+---------------------------+--------------------------------------------------+ -| pud_wrprotect | Creates a write protected PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkinvalid | Invalidates a mapped PUD [1] | -+---------------------------+--------------------------------------------------+ -| pud_set_huge | Creates a PUD huge mapping | -+---------------------------+--------------------------------------------------+ -| pud_clear_huge | Clears a PUD huge mapping | -+---------------------------+--------------------------------------------------+ -| pudp_get_and_clear | Clears a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_get_and_clear_full | Clears a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_test_and_clear_young | Clears young from a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_set_wrprotect | Converts into a write protected PUD | -+---------------------------+--------------------------------------------------+ -| pudp_set_access_flags | Converts into a more permissive PUD | -+---------------------------+--------------------------------------------------+ - - -HugeTLB Page Table Helpers -========================== - -+---------------------------+--------------------------------------------------+ -| pte_huge | Tests a HugeTLB | -+---------------------------+--------------------------------------------------+ -| pte_mkhuge | Creates a HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_dirty | Tests a dirty HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_write | Tests a writable HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_mkdirty | Creates a dirty HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_mkwrite | Creates a writable HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_wrprotect | Creates a write protected HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_get_and_clear | Clears a HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_set_wrprotect | Converts into a write protected HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB | -+---------------------------+--------------------------------------------------+ - - -SWAP Page Table Helpers -======================== - -+---------------------------+--------------------------------------------------+ -| __pte_to_swp_entry | Creates a swapped entry (arch) from a mapped PTE | -+---------------------------+--------------------------------------------------+ -| __swp_to_pte_entry | Creates a mapped PTE from a swapped entry (arch) | -+---------------------------+--------------------------------------------------+ -| __pmd_to_swp_entry | Creates a swapped entry (arch) from a mapped PMD | -+---------------------------+--------------------------------------------------+ -| __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | -+---------------------------+--------------------------------------------------+ -| is_migration_entry | Tests a migration (read or write) swapped entry | -+-------------------------------+----------------------------------------------+ -| is_writable_migration_entry | Tests a write migration swapped entry | -+-------------------------------+----------------------------------------------+ -| make_readable_migration_entry | Creates a read migration swapped entry | -+-------------------------------+----------------------------------------------+ -| make_writable_migration_entry | Creates a write migration swapped entry | -+-------------------------------+----------------------------------------------+ - -[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst deleted file mode 100644 index 6a1fadf3e173..000000000000 --- a/Documentation/vm/balance.rst +++ /dev/null @@ -1,102 +0,0 @@ -.. _balance: - -================ -Memory Balancing -================ - -Started Jan 2000 by Kanoj Sarcar - -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as -well as for non __GFP_IO allocations. - -The first reason why a caller may avoid reclaim is that the caller can not -sleep due to holding a spinlock or is in interrupt context. The second may -be that the caller is willing to fail the allocation without incurring the -overhead of page reclaim. This may happen for opportunistic high-order -allocation requests that have order-0 fallback options. In such cases, -the caller may also wish to avoid waking kswapd. - -__GFP_IO allocation requests are made to prevent file system deadlocks. - -In the absence of non sleepable allocation requests, it seems detrimental -to be doing balancing. Page reclamation can be kicked off lazily, that -is, only when needed (aka zone free memory is 0), instead of making it -a proactive process. - -That being said, the kernel should try to fulfill requests for direct -mapped pages from the direct mapped pool, instead of falling back on -the dma pool, so as to keep the dma pool filled for dma requests (atomic -or not). A similar argument applies to highmem and direct mapped pages. -OTOH, if there is a lot of free dma pages, it is preferable to satisfy -regular memory requests by allocating one from the dma pool, instead -of incurring the overhead of regular zone balancing. - -In 2.2, memory balancing/page reclamation would kick off only when the -_total_ number of free pages fell below 1/64 th of total memory. With the -right ratio of dma and regular memory, it is quite possible that balancing -would not be done even when the dma zone was completely empty. 2.2 has -been running production machines of varying memory sizes, and seems to be -doing fine even with the presence of this problem. In 2.3, due to -HIGHMEM, this problem is aggravated. - -In 2.3, zone balancing can be done in one of two ways: depending on the -zone size (and possibly of the size of lower class zones), we can decide -at init time how many free pages we should aim for while balancing any -zone. The good part is, while balancing, we do not need to look at sizes -of lower class zones, the bad part is, we might do too frequent balancing -due to ignoring possibly lower usage in the lower class zones. Also, -with a slight change in the allocation routine, it is possible to reduce -the memclass() macro to be a simple equality. - -Another possible solution is that we balance only when the free memory -of a zone _and_ all its lower class zones falls below 1/64th of the -total memory in the zone and its lower class zones. This fixes the 2.2 -balancing problem, and stays as close to 2.2 behavior as possible. Also, -the balancing algorithm works the same way on the various architectures, -which have different numbers and types of zones. If we wanted to get -fancy, we could assign different weights to free pages in different -zones in the future. - -Note that if the size of the regular zone is huge compared to dma zone, -it becomes less significant to consider the free dma pages while -deciding whether to balance the regular zone. The first solution -becomes more attractive then. - -The appended patch implements the second solution. It also "fixes" two -problems: first, kswapd is woken up as in 2.2 on low memory conditions -for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, -so as to give a fighting chance for replace_with_highmem() to get a -HIGHMEM page, as well as to ensure that HIGHMEM allocations do not -fall back into regular zone. This also makes sure that HIGHMEM pages -are not leaked (for example, in situations where a HIGHMEM page is in -the swapcache but is not being used by anyone) - -kswapd also needs to know about the zones it should balance. kswapd is -primarily needed in a situation where balancing can not be done, -probably because all allocation requests are coming from intr context -and all process contexts are sleeping. For 2.3, kswapd does not really -need to balance the highmem zone, since intr context does not request -highmem pages. kswapd looks at the zone_wake_kswapd field in the zone -structure to decide whether a zone needs balancing. - -Page stealing from process memory and shm is done if stealing the page would -alleviate memory pressure on any zone in the page's node that has fallen below -its watermark. - -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These -are per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below watermark[WMARK_MIN], the hysteric field -low_on_memory gets set. This stays set till the number of free pages becomes -watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will -try to free some pages in the zone (providing GFP_WAIT is set in the request). -Orthogonal to this, is the decision to poke kswapd to free some zone pages. -That decision is not hysteresis based, and is done when the number of free -pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. - - -(Good) Ideas that I have heard: - -1. Dynamic experience should influence balancing: number of failed requests - for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) -2. Implement a replace_with_highmem()-like replace_with_regular() to preserve - dma pages. (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/vm/bootmem.rst b/Documentation/vm/bootmem.rst deleted file mode 100644 index eb2b31eedfa1..000000000000 --- a/Documentation/vm/bootmem.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=========== -Boot Memory -=========== diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst deleted file mode 100644 index 08f34df45523..000000000000 --- a/Documentation/vm/damon/api.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============= -API Reference -============= - -Kernel space programs can use every feature of DAMON using below APIs. All you -need to do is including ``damon.h``, which is located in ``include/linux/`` of -the source tree. - -Structures -========== - -.. kernel-doc:: include/linux/damon.h - - -Functions -========= - -.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst deleted file mode 100644 index 0cff6fac6b7e..000000000000 --- a/Documentation/vm/damon/design.rst +++ /dev/null @@ -1,176 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====== -Design -====== - -Configurable Layers -=================== - -DAMON provides data access monitoring functionality while making the accuracy -and the overhead controllable. The fundamental access monitorings require -primitives that dependent on and optimized for the target address space. On -the other hand, the accuracy and overhead tradeoff mechanism, which is the core -of DAMON, is in the pure logic space. DAMON separates the two parts in -different layers and defines its interface to allow various low level -primitives implementations configurable with the core logic. We call the low -level primitives implementations monitoring operations. - -Due to this separated design and the configurable interface, users can extend -DAMON for any address space by configuring the core logics with appropriate -monitoring operations. If appropriate one is not provided, users can implement -the operations on their own. - -For example, physical memory, virtual memory, swap space, those for specific -processes, NUMA nodes, files, and backing memory devices would be supportable. -Also, if some architectures or devices support special optimized access check -primitives, those will be easily configurable. - - -Reference Implementations of Address Space Specific Monitoring Operations -========================================================================= - -The monitoring operations are defined in two parts: - -1. Identification of the monitoring target address range for the address space. -2. Access check of specific address range in the target space. - -DAMON currently provides the implementations of the operations for the physical -and virtual address spaces. Below two subsections describe how those work. - - -VMA-based Target Address Range Construction -------------------------------------------- - -This is only for the virtual address space monitoring operations -implementation. That for the physical address space simply asks users to -manually set the monitoring target address ranges. - -Only small parts in the super-huge virtual address space of the processes are -mapped to the physical memory and accessed. Thus, tracking the unmapped -address regions is just wasteful. However, because DAMON can deal with some -level of noise using the adaptive regions adjustment mechanism, tracking every -mapping is not strictly required but could even incur a high overhead in some -cases. That said, too huge unmapped areas inside the monitoring target should -be removed to not take the time for the adaptive mechanism. - -For the reason, this implementation converts the complex mappings to three -distinct regions that cover every mapped area of the address space. The two -gaps between the three regions are the two biggest unmapped areas in the given -address space. The two biggest unmapped areas would be the gap between the -heap and the uppermost mmap()-ed region, and the gap between the lowermost -mmap()-ed region and the stack in most of the cases. Because these gaps are -exceptionally huge in usual address spaces, excluding these will be sufficient -to make a reasonable trade-off. Below shows this in detail:: - - - - - (small mmap()-ed regions and munmap()-ed regions) - - - - - -PTE Accessed-bit Based Access Check ------------------------------------ - -Both of the implementations for physical and virtual address spaces use PTE -Accessed-bit for basic access checks. Only one difference is the way of -finding the relevant PTE Accessed bit(s) from the address. While the -implementation for the virtual address walks the page table for the target task -of the address, the implementation for the physical address walks every page -table having a mapping to the address. In this way, the implementations find -and clear the bit(s) for next sampling target address and checks whether the -bit(s) set again after one sampling period. This could disturb other kernel -subsystems using the Accessed bits, namely Idle page tracking and the reclaim -logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling -the interference is the responsibility of sysadmins. However, it solves the -conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, -as Idle page tracking does. - - -Address Space Independent Core Mechanisms -========================================= - -Below four sections describe each of the DAMON core mechanisms and the five -monitoring attributes, ``sampling interval``, ``aggregation interval``, -``update interval``, ``minimum number of regions``, and ``maximum number of -regions``. - - -Access Frequency Monitoring ---------------------------- - -The output of DAMON says what pages are how frequently accessed for a given -duration. The resolution of the access frequency is controlled by setting -``sampling interval`` and ``aggregation interval``. In detail, DAMON checks -access to each page per ``sampling interval`` and aggregates the results. In -other words, counts the number of the accesses to each page. After each -``aggregation interval`` passes, DAMON calls callback functions that previously -registered by users so that users can read the aggregated results and then -clears the results. This can be described in below simple pseudo-code:: - - while monitoring_on: - for page in monitoring_target: - if accessed(page): - nr_accesses[page] += 1 - if time() % aggregation_interval == 0: - for callback in user_registered_callbacks: - callback(monitoring_target, nr_accesses) - for page in monitoring_target: - nr_accesses[page] = 0 - sleep(sampling interval) - -The monitoring overhead of this mechanism will arbitrarily increase as the -size of the target workload grows. - - -Region Based Sampling ---------------------- - -To avoid the unbounded increase of the overhead, DAMON groups adjacent pages -that assumed to have the same access frequencies into a region. As long as the -assumption (pages in a region have the same access frequencies) is kept, only -one page in the region is required to be checked. Thus, for each ``sampling -interval``, DAMON randomly picks one page in each region, waits for one -``sampling interval``, checks whether the page is accessed meanwhile, and -increases the access frequency of the region if so. Therefore, the monitoring -overhead is controllable by setting the number of regions. DAMON allows users -to set the minimum and the maximum number of regions for the trade-off. - -This scheme, however, cannot preserve the quality of the output if the -assumption is not guaranteed. - - -Adaptive Regions Adjustment ---------------------------- - -Even somehow the initial monitoring target regions are well constructed to -fulfill the assumption (pages in same region have similar access frequencies), -the data access pattern can be dynamically changed. This will result in low -monitoring quality. To keep the assumption as much as possible, DAMON -adaptively merges and splits each region based on their access frequency. - -For each ``aggregation interval``, it compares the access frequencies of -adjacent regions and merges those if the frequency difference is small. Then, -after it reports and clears the aggregated access frequency of each region, it -splits each region into two or three regions if the total number of regions -will not exceed the user-specified maximum number of regions after the split. - -In this way, DAMON provides its best-effort quality and minimal overhead while -keeping the bounds users set for their trade-off. - - -Dynamic Target Space Updates Handling -------------------------------------- - -The monitoring target address range could dynamically changed. For example, -virtual memory could be dynamically mapped and unmapped. Physical memory could -be hot-plugged. - -As the changes could be quite frequent in some cases, DAMON allows the -monitoring operations to check dynamic changes including memory mapping changes -and applies it to monitoring operations-related data structures such as the -abstracted monitoring target memory area only for each of a user-specified time -interval (``update interval``). diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst deleted file mode 100644 index dde7e2414ee6..000000000000 --- a/Documentation/vm/damon/faq.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================== -Frequently Asked Questions -========================== - -Why a new subsystem, instead of extending perf or other user space tools? -========================================================================= - -First, because it needs to be lightweight as much as possible so that it can be -used online, any unnecessary overhead such as kernel - user space context -switching cost should be avoided. Second, DAMON aims to be used by other -programs including the kernel. Therefore, having a dependency on specific -tools like perf is not desirable. These are the two biggest reasons why DAMON -is implemented in the kernel space. - - -Can 'idle pages tracking' or 'perf mem' substitute DAMON? -========================================================= - -Idle page tracking is a low level primitive for access check of the physical -address space. 'perf mem' is similar, though it can use sampling to minimize -the overhead. On the other hand, DAMON is a higher-level framework for the -monitoring of various address spaces. It is focused on memory management -optimization and provides sophisticated accuracy/overhead handling mechanisms. -Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of -DAMON's output, but cannot substitute DAMON. - - -Does DAMON support virtual memory only? -======================================= - -No. The core of the DAMON is address space independent. The address space -specific monitoring operations including monitoring target regions -constructions and actual access checks can be implemented and configured on the -DAMON core by the users. In this way, DAMON users can monitor any address -space with any access check technique. - -Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based -implementations of the address space dependent functions for the virtual memory -and the physical memory by default, for a reference and convenient use. - - -Can I simply monitor page granularity? -====================================== - -Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the -working set size divided by the page size. Because the monitoring target -regions size is forced to be ``>=page size``, the region split will make no -effect. diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst deleted file mode 100644 index 48c0bbff98b2..000000000000 --- a/Documentation/vm/damon/index.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================== -DAMON: Data Access MONitor -========================== - -DAMON is a data access monitoring framework subsystem for the Linux kernel. -The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it - - - *accurate* (the monitoring output is useful enough for DRAM level memory - management; It might not appropriate for CPU Cache levels, though), - - *light-weight* (the monitoring overhead is low enough to be applied online), - and - - *scalable* (the upper-bound of the overhead is in constant range regardless - of the size of target workloads). - -Using this framework, therefore, the kernel's memory management mechanisms can -make advanced decisions. Experimental memory management optimization works -that incurring high data accesses monitoring overhead could implemented again. -In user space, meanwhile, users who have some special workloads can write -personalized applications for better understanding and optimizations of their -workloads and systems. - -.. toctree:: - :maxdepth: 2 - - faq - design - api diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst deleted file mode 100644 index 8c05e62d8b2b..000000000000 --- a/Documentation/vm/free_page_reporting.rst +++ /dev/null @@ -1,40 +0,0 @@ -.. _free_page_reporting: - -===================== -Free Page Reporting -===================== - -Free page reporting is an API by which a device can register to receive -lists of pages that are currently unused by the system. This is useful in -the case of virtualization where a guest is then able to use this data to -notify the hypervisor that it is no longer using certain pages in memory. - -For the driver, typically a balloon driver, to use of this functionality -it will allocate and initialize a page_reporting_dev_info structure. The -field within the structure it will populate is the "report" function -pointer used to process the scatterlist. It must also guarantee that it can -handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per -call to the function. A call to page_reporting_register will register the -page reporting interface with the reporting framework assuming no other -page reporting devices are already registered. - -Once registered the page reporting API will begin reporting batches of -pages to the driver. The API will start reporting pages 2 seconds after -the interface is registered and will continue to do so 2 seconds after any -page of a sufficiently high order is freed. - -Pages reported will be stored in the scatterlist passed to the reporting -function with the final entry having the end bit set in entry nent - 1. -While pages are being processed by the report function they will not be -accessible to the allocator. Once the report function has been completed -the pages will be returned to the free area from which they were obtained. - -Prior to removing a driver that is making use of free page reporting it -is necessary to call page_reporting_unregister to have the -page_reporting_dev_info structure that is currently in use by free page -reporting removed. Doing this will prevent further reports from being -issued via the interface. If another driver or the same driver is -registered it is possible for it to resume where the previous driver had -left off in terms of reporting free pages. - -Alexander Duyck, Dec 04, 2019 diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst deleted file mode 100644 index feecc5e24477..000000000000 --- a/Documentation/vm/frontswap.rst +++ /dev/null @@ -1,266 +0,0 @@ -.. _frontswap: - -========= -Frontswap -========= - -Frontswap provides a "transcendent memory" interface for swap pages. -In some environments, dramatic performance savings may be obtained because -swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap is so named because it can be thought of as the opposite of -a "backing" store for a swap device. The storage is assumed to be -a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming -to the requirements of transcendent memory (such as Xen's "tmem", or -in-kernel compressed memory, aka "zcache", or future RAM-like devices); -this pseudo-RAM device is not directly accessible or addressable by the -kernel and is of unknown and possibly time-varying size. The driver -links itself to frontswap by calling frontswap_register_ops to set the -frontswap_ops funcs appropriately and the functions it provides must -conform to certain policies as follows: - -An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "store" will -copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "load" will copy the page, if found, -from transcendent memory into kernel memory, but will NOT remove the page -from transcendent memory. An "invalidate_page" will remove the page -from transcendent memory and an "invalidate_area" will remove ALL pages -associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further stores with that swap type. - -Once a page is successfully stored, a matching load on the page will normally -succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the store returns -success, the data has been successfully saved to transcendent memory and -a disk write and, if the data is later read back, a disk read are avoided. -If a store returns failure, transcendent memory has rejected the data, and the -page can be written to swap as usual. - -Note that if a page is stored and the page already exists in transcendent memory -(a "duplicate" store), either the store succeeds and the data is overwritten, -or the store fails AND the page is invalidated. This ensures stale data may -never be obtained from frontswap. - -If properly configured, monitoring of frontswap is done via debugfs in -the `/sys/kernel/debug/frontswap` directory. The effectiveness of -frontswap can be measured (across all swap devices) with: - -``failed_stores`` - how many store attempts have failed - -``loads`` - how many loads were attempted (all should succeed) - -``succ_stores`` - how many store attempts have succeeded - -``invalidates`` - how many invalidates were attempted - -A backend implementation may provide additional metrics. - -FAQ -=== - -* Where's the value? - -When a workload starts swapping, performance falls through the floor. -Frontswap significantly increases performance in many such workloads by -providing a clean, dynamic interface to read and write swap pages to -"transcendent memory" that is otherwise not directly addressable to the kernel. -This interface is ideal when data is transformed to a different form -and size (such as with compression) or secretly moved (as might be -useful for write-balancing for some RAM-like devices). Swap pages (and -evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device". - -Frontswap with a fairly small impact on the kernel, -provides a huge amount of flexibility for more dynamic, flexible RAM -utilization in various system configurations: - -In the single kernel case, aka "zcache", pages are compressed and -stored in local memory, thus increasing the total anonymous pages -that can be safely kept in RAM. Zcache essentially trades off CPU -cycles used in compression/decompression for better memory utilization. -Benchmarks have shown little or no impact when memory pressure is -low while providing a significant performance improvement (25%+) -on some workloads under high memory pressure. - -"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory -support for clustered systems. Frontswap pages are locally compressed -as in zcache, but then "remotified" to another system's RAM. This -allows RAM to be dynamically load-balanced back-and-forth as needed, -i.e. when system A is overcommitted, it can swap to system B, and -vice versa. RAMster can also be configured as a memory server so -many servers in a cluster can swap, dynamically as needed, to a single -server configured with a large amount of RAM... without pre-configuring -how much of the RAM is available for each of the clients! - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to do -it well with no kernel changes have essentially failed (except in some -well-publicized special-case workloads). -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "selfballooning"), sudden unexpected -memory pressure may result in swapping; frontswap allows those pages -to be swapped to and from hypervisor RAM (if overall host system memory -conditions allow), thus mitigating the potentially awful performance impact -of unplanned swapping. - -A KVM implementation is underway and has been RFC'ed to lkml. And, -using frontswap, investigation is also underway on the use of NVM as -a memory extension technology. - -* Sure there may be performance advantages in some situations, but - what's the space/time overhead of frontswap? - -If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into -nothingness and the only overhead is a few extra bytes per swapon'ed -swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" -registers, there is one extra global variable compared to zero for -every swap page read or written. If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "store" -request (i.e. provides no memory despite claiming it might), -CPU overhead is still negligible -- and since every frontswap fail -precedes a swap page write-to-disk, the system is highly likely -to be I/O bound and using a small fraction of a percent of a CPU -will be irrelevant anyway. - -As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend -registers, one bit is allocated for every swap page for every swap -device that is swapon'd. This is added to the EIGHT bits (which -was sixteen until about 2.6.34) that the kernel already allocates -for every swap page for every swap device that is swapon'd. (Hugh -Dickins has observed that frontswap could probably steal one of -the existing eight bits, but let's worry about that minor optimization -later.) For very large swap disks (which are rare) on a standard -4K pagesize, this is 1MB per 32GB swap. - -When swap pages are stored in transcendent memory instead of written -out to disk, there is a side effect that this may create more memory -pressure that can potentially outweigh the other advantages. A -backend, such as zcache, must implement policies to carefully (but -dynamically) manage memory limits to ensure this doesn't happen. - -* OK, how about a quick overview of what this frontswap patch does - in terms that a kernel hacker can grok? - -Let's assume that a frontswap "backend" has registered during -kernel initialization; this registration indicates that this -frontswap backend has access to some "memory" that is not directly -accessible by the kernel. Exactly how much memory it provides is -entirely dynamic and random. - -Whenever a swap-device is swapon'd frontswap_init() is called, -passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "store" swap pages -associated with that number. - -Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_store is called. Frontswap -consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_store returns -1 and the kernel swaps the page -to the swap device as normal. Note that the response from the frontswap -backend is unpredictable to the kernel; it may choose to never accept a -page, it could accept every ninth page, or it might accept every -page. But if the backend does accept a page, the data from the page -has already been copied and associated with the type and offset, -and the backend guarantees the persistence of the data. In this case, -frontswap sets a bit in the "frontswap_map" for the swap device -corresponding to the page offset on the swap device to which it would -otherwise have written the data. - -When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_load() which checks the frontswap_map to -see if the page was earlier accepted by the frontswap backend. If -it was, the page of data is filled from the frontswap backend and -the swap-in is complete. If not, the normal swap-in code is -executed to obtain the page of data from the real swap device. - -So every time the frontswap backend accepts a page, a swap device read -and (potentially) a swap device write are replaced by a "frontswap backend -store" and (possibly) a "frontswap backend loads", which are presumably much -faster. - -* Can't frontswap be configured as a "special" swap device that is - just higher priority than any real swap device (e.g. like zswap, - or maybe swap-over-nbd/NFS)? - -No. First, the existing swap subsystem doesn't allow for any kind of -swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, -but this would require fairly drastic changes. Even if it were -rewritten, the existing swap subsystem uses the block I/O layer which -assumes a swap device is fixed size and any page in it is linearly -addressable. Frontswap barely touches the existing swap subsystem, -and works around the constraints of the block I/O subsystem to provide -a great deal of flexibility and dynamicity. - -For example, the acceptance of any swap page by the frontswap backend is -entirely unpredictable. This is critical to the definition of frontswap -backends because it grants completely dynamic discretion to the -backend. In zcache, one cannot know a priori how compressible a page is. -"Poorly" compressible pages can be rejected, and "poorly" can itself be -defined dynamically depending on current memory constraints. - -Further, frontswap is entirely synchronous whereas a real swap -device is, by definition, asynchronous and uses block I/O. The -block I/O layer is not only unnecessary, but may perform "optimizations" -that are inappropriate for a RAM-oriented device including delaying -the write of some pages for a significant amount of time. Synchrony is -required to ensure the dynamicity of the backend and to avoid thorny race -conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "store" -and "load" operations need be synchronous. A separate asynchronous thread -is free to manipulate the pages stored by frontswap. For example, -the "remotification" thread in RAMster uses standard asynchronous -kernel sockets to move compressed frontswap pages to a remote machine. -Similarly, a KVM guest-side implementation could do in-guest compression -and use "batched" hypercalls. - -In a virtualized environment, the dynamicity allows the hypervisor -(or host OS) to do "intelligent overcommit". For example, it can -choose to accept pages only until host-swapping might be imminent, -then force guests to do their own swapping. - -There is a downside to the transcendent memory specifications for -frontswap: Since any "store" might fail, there must always be a real -slot on a real swap device to swap the page. Thus frontswap must be -implemented as a "shadow" to every swapon'd device with the potential -capability of holding every page that the swap device might have held -and the possibility that it might hold no pages at all. This means -that frontswap cannot contain more pages than the total of swapon'd -swap devices. For example, if NO swap device is configured on some -installation, frontswap is useless. Swapless portable devices -can still use frontswap but a backend for such devices must configure -some kind of "ghost" swap device and ensure that it is never used. - -* Why this weird definition about "duplicate stores"? If a page - has been previously successfully stored, can't it always be - successfully overwritten? - -Nearly always it can, but no, sometimes it cannot. Consider an example -where data is compressed and the original 4K page has been compressed -to 1K. Now an attempt is made to overwrite the page with data that -is non-compressible and so would take the entire 4K. But the backend -has no more space. In this case, the store must be rejected. Whenever -frontswap rejects a store that would overwrite, it also must invalidate -the old data and ensure that it is no longer accessible. Since the -swap subsystem then writes the new data to the read swap device, -this is the correct course of action to ensure coherency. - -* Why does the frontswap patch create the new include file swapfile.h? - -The frontswap code depends on some swap-subsystem-internal data -structures that have, over the years, moved back and forth between -static and global. This seemed a reasonable compromise: Define -them as global but declare them in a new include file that isn't -included by the large number of source files that include swap.h. - -Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst deleted file mode 100644 index c9887f241c6c..000000000000 --- a/Documentation/vm/highmem.rst +++ /dev/null @@ -1,167 +0,0 @@ -.. _highmem: - -==================== -High Memory Handling -==================== - -By: Peter Zijlstra - -.. contents:: :local: - -What Is High Memory? -==================== - -High memory (highmem) is used when the size of physical memory approaches or -exceeds the maximum size of virtual memory. At that point it becomes -impossible for the kernel to keep all of the available physical memory mapped -at all times. This means the kernel needs to start using temporary mappings of -the pieces of physical memory that it wants to access. - -The part of (physical) memory not covered by a permanent mapping is what we -refer to as 'highmem'. There are various architecture dependent constraints on -where exactly that border lies. - -In the i386 arch, for example, we choose to map the kernel into every process's -VM space so that we don't have to pay the full TLB invalidation costs for -kernel entry/exit. This means the available virtual memory space (4GiB on -i386) has to be divided between user and kernel space. - -The traditional split for architectures using this approach is 3:1, 3GiB for -userspace and the top 1GiB for kernel space:: - - +--------+ 0xffffffff - | Kernel | - +--------+ 0xc0000000 - | | - | User | - | | - +--------+ 0x00000000 - -This means that the kernel can at most map 1GiB of physical memory at any one -time, but because we need virtual address space for other things - including -temporary maps to access the rest of the physical memory - the actual direct -map will typically be less (usually around ~896MiB). - -Other architectures that have mm context tagged TLBs can have separate kernel -and user maps. Some hardware (like some ARMs), however, have limited virtual -space when they use mm context tags. - - -Temporary Virtual Mappings -========================== - -The kernel contains several ways of creating temporary mappings. The following -list shows them in order of preference of use. - -* kmap_local_page(). This function is used to require short term mappings. - It can be invoked from any context (including interrupts) but the mappings - can only be used in the context which acquired them. - - This function should be preferred, where feasible, over all the others. - - These mappings are thread-local and CPU-local, meaning that the mapping - can only be accessed from within this thread and the thread is bound the - CPU while the mapping is active. Even if the thread is preempted (since - preemption is never disabled by the function) the CPU can not be - unplugged from the system via CPU-hotplug until the mapping is disposed. - - It's valid to take pagefaults in a local kmap region, unless the context - in which the local mapping is acquired does not allow it for other reasons. - - kmap_local_page() always returns a valid virtual address and it is assumed - that kunmap_local() will never fail. - - Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain - extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered - because the map implementation is stack based. See kmap_local_page() kdocs - (included in the "Functions" section) for details on how to manage nested - mappings. - -* kmap_atomic(). This permits a very short duration mapping of a single - page. Since the mapping is restricted to the CPU that issued it, it - performs well, but the issuing task is therefore required to stay on that - CPU until it has finished, lest some other task displace its mappings. - - kmap_atomic() may also be used by interrupt contexts, since it does not - sleep and the callers too may not sleep until after kunmap_atomic() is - called. - - Each call of kmap_atomic() in the kernel creates a non-preemptible section - and disable pagefaults. This could be a source of unwanted latency. Therefore - users should prefer kmap_local_page() instead of kmap_atomic(). - - It is assumed that k[un]map_atomic() won't fail. - -* kmap(). This should be used to make short duration mapping of a single - page with no restrictions on preemption or migration. It comes with an - overhead as mapping space is restricted and protected by a global lock - for synchronization. When mapping is no longer needed, the address that - the page was mapped to must be released with kunmap(). - - Mapping changes must be propagated across all the CPUs. kmap() also - requires global TLB invalidation when the kmap's pool wraps and it might - block when the mapping space is fully utilized until a slot becomes - available. Therefore, kmap() is only callable from preemptible context. - - All the above work is necessary if a mapping must last for a relatively - long time but the bulk of high-memory mappings in the kernel are - short-lived and only used in one place. This means that the cost of - kmap() is mostly wasted in such cases. kmap() was not intended for long - term mappings but it has morphed in that direction and its use is - strongly discouraged in newer code and the set of the preceding functions - should be preferred. - - On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have - no real work to do because a 64-bit address space is more than sufficient to - address all the physical memory whose pages are permanently mapped. - -* vmap(). This can be used to make a long duration mapping of multiple - physical pages into a contiguous virtual space. It needs global - synchronization to unmap. - - -Cost of Temporary Mappings -========================== - -The cost of creating temporary mappings can be quite high. The arch has to -manipulate the kernel's page tables, the data TLB and/or the MMU's registers. - -If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping -simply with a bit of arithmetic that will convert the page struct address into -a pointer to the page contents rather than juggling mappings about. In such a -case, the unmap operation may be a null operation. - -If CONFIG_MMU is not set, then there can be no temporary mappings and no -highmem. In such a case, the arithmetic approach will also be used. - - -i386 PAE -======== - -The i386 arch, under some circumstances, will permit you to stick up to 64GiB -of RAM into your 32-bit machine. This has a number of consequences: - -* Linux needs a page-frame structure for each page in the system and the - pageframes need to live in the permanent mapping, which means: - -* you can have 896M/sizeof(struct page) page-frames at most; with struct - page being 32-bytes that would end up being something in the order of 112G - worth of pages; the kernel, however, needs to store more than just - page-frames in that memory... - -* PAE makes your page tables larger - which slows the system down as more - data has to be accessed to traverse in TLB fills and the like. One - advantage is that PAE has more PTE bits and can provide advanced features - like NX and PAT. - -The general recommendation is that you don't use more than 8GiB on a 32-bit -machine - although more might work for you and your workload, you're pretty -much on your own - don't expect kernel developers to really care much if things -come apart. - - -Functions -========= - -.. kernel-doc:: include/linux/highmem.h -.. kernel-doc:: include/linux/highmem-internal.h diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst deleted file mode 100644 index f2a59ed82ed3..000000000000 --- a/Documentation/vm/hmm.rst +++ /dev/null @@ -1,452 +0,0 @@ -.. _hmm: - -===================================== -Heterogeneous Memory Management (HMM) -===================================== - -Provide infrastructure and helpers to integrate non-conventional memory (device -memory like GPU on board memory) into regular kernel path, with the cornerstone -of this being specialized struct page for such memory (see sections 5 to 7 of -this document). - -HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., -allowing a device to transparently access program addresses coherently with -the CPU meaning that any valid pointer on the CPU is also a valid pointer -for the device. This is becoming mandatory to simplify the use of advanced -heterogeneous computing where GPU, DSP, or FPGA are used to perform various -computations on behalf of a process. - -This document is divided as follows: in the first section I expose the problems -related to using device specific memory allocators. In the second section, I -expose the hardware limitations that are inherent to many platforms. The third -section gives an overview of the HMM design. The fourth section explains how -CPU page-table mirroring works and the purpose of HMM in this context. The -fifth section deals with how device memory is represented inside the kernel. -Finally, the last section presents a new migration helper that allows -leveraging the device DMA engine. - -.. contents:: :local: - -Problems of using a device specific memory allocator -==================================================== - -Devices with a large amount of on board memory (several gigabytes) like GPUs -have historically managed their memory through dedicated driver specific APIs. -This creates a disconnect between memory allocated and managed by a device -driver and regular application memory (private anonymous, shared memory, or -regular file backed memory). From here on I will refer to this aspect as split -address space. I use shared address space to refer to the opposite situation: -i.e., one in which any application memory region can be used by a device -transparently. - -Split address space happens because devices can only access memory allocated -through a device specific API. This implies that all memory objects in a program -are not equal from the device point of view which complicates large programs -that rely on a wide set of libraries. - -Concretely, this means that code that wants to leverage devices like GPUs needs -to copy objects between generically allocated memory (malloc, mmap private, mmap -share) and memory allocated through the device driver API (this still ends up -with an mmap but of the device file). - -For flat data sets (array, grid, image, ...) this isn't too hard to achieve but -for complex data sets (list, tree, ...) it's hard to get right. Duplicating a -complex data set needs to re-map all the pointer relations between each of its -elements. This is error prone and programs get harder to debug because of the -duplicate data set and addresses. - -Split address space also means that libraries cannot transparently use data -they are getting from the core program or another library and thus each library -might have to duplicate its input data set using the device specific memory -allocator. Large projects suffer from this and waste resources because of the -various memory copies. - -Duplicating each library API to accept as input or output memory allocated by -each device specific allocator is not a viable option. It would lead to a -combinatorial explosion in the library entry points. - -Finally, with the advance of high level language constructs (in C++ but in -other languages too) it is now possible for the compiler to leverage GPUs and -other devices without programmer knowledge. Some compiler identified patterns -are only do-able with a shared address space. It is also more reasonable to use -a shared address space for all other patterns. - - -I/O bus, device memory characteristics -====================================== - -I/O buses cripple shared address spaces due to a few limitations. Most I/O -buses only allow basic memory access from device to main memory; even cache -coherency is often optional. Access to device memory from a CPU is even more -limited. More often than not, it is not cache coherent. - -If we only consider the PCIE bus, then a device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. However, it only allows -a limited set of atomic operations from the device on main memory. This is worse -in the other direction: the CPU can only access a limited range of the device -memory and cannot perform atomic operations on it. Thus device memory cannot -be considered the same as regular memory from the kernel point of view. - -Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). -The final limitation is latency. Access to main memory from the device has an -order of magnitude higher latency than when the device accesses its own memory. - -Some platforms are developing new I/O buses or additions/modifications to PCIE -to address some of these limitations (OpenCAPI, CCIX). They mainly allow -two-way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Sadly, not all platforms are following this trend and -some major architectures are left without hardware solutions to these problems. - -So for shared address space to make sense, not only must we allow devices to -access any memory but we must also permit any memory to be migrated to device -memory while the device is using it (blocking CPU access while it happens). - - -Shared address space and migration -================================== - -HMM intends to provide two main features. The first one is to share the address -space by duplicating the CPU page table in the device page table so the same -address points to the same physical memory for any valid main memory address in -the process address space. - -To achieve this, HMM offers a set of helpers to populate the device page table -while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. To update the device page table, you must -allocate a buffer (or use a pool of pre-allocated buffers) and write GPU -specific commands in it to perform the update (unmap, cache invalidations, and -flush, ...). This cannot be done through common code for all devices. Hence -why HMM provides helpers to factor out everything that can be while leaving the -hardware specific details to the device driver. - -The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that -allows allocating a struct page for each page of device memory. Those pages -are special because the CPU cannot map them. However, they allow migrating -main memory to device memory using existing migration mechanisms and everything -looks like a page that is swapped out to disk from the CPU point of view. Using a -struct page gives the easiest and cleanest integration with existing mm -mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE -memory for the device memory and second to perform migration. Policy decisions -of what and when to migrate is left to the device driver. - -Note that any CPU access to a device page triggers a page fault and a migration -back to main memory. For example, when a page backing a given CPU address A is -migrated from a main memory page to a device page, then any CPU access to -address A triggers a page fault and initiates a migration back to main memory. - -With these two features, HMM not only allows a device to mirror process address -space and keeps both CPU and device page tables synchronized, but also -leverages device memory by migrating the part of the data set that is actively being -used by the device. - - -Address space mirroring implementation and API -============================================== - -Address space mirroring's main objective is to allow duplication of a range of -CPU page table into a device page table; HMM helps keep both synchronized. A -device driver that wants to mirror a process address space must start with the -registration of a mmu_interval_notifier:: - - int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, - struct mm_struct *mm, unsigned long start, - unsigned long length, - const struct mmu_interval_notifier_ops *ops); - -During the ops->invalidate() callback the device driver must perform the -update action to the range (mark range read only, or fully unmap, etc.). The -device must complete the update before the driver callback returns. - -When the device driver wants to populate a range of virtual addresses, it can -use:: - - int hmm_range_fault(struct hmm_range *range); - -It will trigger a page fault on missing or read-only entries if write access is -requested (see below). Page faults use the generic mm page fault code path just -like a CPU page fault. - -Both functions copy CPU page table entries into their pfns array argument. Each -entry in that array corresponds to an address in the virtual range. HMM -provides a set of flags to help the driver identify special CPU page table -entries. - -Locking within the sync_cpu_device_pagetables() callback is the most important -aspect the driver must respect in order to keep things properly synchronized. -The usage pattern is:: - - int driver_populate_range(...) - { - struct hmm_range range; - ... - - range.notifier = &interval_sub; - range.start = ...; - range.end = ...; - range.hmm_pfns = ...; - - if (!mmget_not_zero(interval_sub->notifier.mm)) - return -EFAULT; - - again: - range.notifier_seq = mmu_interval_read_begin(&interval_sub); - mmap_read_lock(mm); - ret = hmm_range_fault(&range); - if (ret) { - mmap_read_unlock(mm); - if (ret == -EBUSY) - goto again; - return ret; - } - mmap_read_unlock(mm); - - take_lock(driver->update); - if (mmu_interval_read_retry(&ni, range.notifier_seq) { - release_lock(driver->update); - goto again; - } - - /* Use pfns array content to update device page table, - * under the update lock */ - - release_lock(driver->update); - return 0; - } - -The driver->update lock is the same lock that the driver takes inside its -invalidate() callback. That lock must be held before calling -mmu_interval_read_retry() to avoid any race with a concurrent CPU page table -update. - -Leverage default_flags and pfn_flags_mask -========================================= - -The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify -fault or snapshot policy for the whole range instead of having to set them -for each entry in the pfns array. - -For instance if the device driver wants pages for a range with at least read -permission, it sets:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = 0; - -and calls hmm_range_fault() as described above. This will fill fault all pages -in the range with at least read permission. - -Now let's say the driver wants to do the same except for one page in the range for -which it wants to have write permission. Now driver set:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = HMM_PFN_REQ_WRITE; - range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; - -With this, HMM will fault in all pages with at least read (i.e., valid) and for the -address == range->start + (index_of_write << PAGE_SHIFT) it will fault with -write permission i.e., if the CPU pte does not have write permission set then HMM -will call handle_mm_fault(). - -After hmm_range_fault completes the flag bits are set to the current state of -the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is -writable. - - -Represent and manage device memory from core kernel point of view -================================================================= - -Several different designs were tried to support device memory. The first one -used a device specific data structure to keep information about migrated memory -and HMM hooked itself in various places of mm code to handle any access to -addresses that were backed by device memory. It turns out that this ended up -replicating most of the fields of struct page and also needed many kernel code -paths to be updated to understand this new kind of memory. - -Most kernel code paths never try to access the memory behind a page -but only care about struct page contents. Because of this, HMM switched to -directly using struct page for device memory which left most kernel code paths -unaware of the difference. We only need to make sure that no one ever tries to -map those pages from the CPU side. - -Migration to and from device memory -=================================== - -Because the CPU cannot access device memory directly, the device driver must -use hardware DMA or device specific load/store instructions to migrate data. -The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() -functions are designed to make drivers easier to write and to centralize common -code across drivers. - -Before migrating pages to device private memory, special device private -``struct page`` need to be created. These will be used as special "swap" -page table entries so that a CPU process will fault if it tries to access -a page that has been migrated to device private memory. - -These can be allocated and freed with:: - - struct resource *res; - struct dev_pagemap pagemap; - - res = request_free_mem_region(&iomem_resource, /* number of bytes */, - "name of driver resource"); - pagemap.type = MEMORY_DEVICE_PRIVATE; - pagemap.range.start = res->start; - pagemap.range.end = res->end; - pagemap.nr_range = 1; - pagemap.ops = &device_devmem_ops; - memremap_pages(&pagemap, numa_node_id()); - - memunmap_pages(&pagemap); - release_mem_region(pagemap.range.start, range_len(&pagemap.range)); - -There are also devm_request_free_mem_region(), devm_memremap_pages(), -devm_memunmap_pages(), and devm_release_mem_region() when the resources can -be tied to a ``struct device``. - -The overall migration steps are similar to migrating NUMA pages within system -memory (see :ref:`Page migration `) but the steps are split -between device driver specific code and shared common code: - -1. ``mmap_read_lock()`` - - The device driver has to pass a ``struct vm_area_struct`` to - migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to - be held for the duration of the migration. - -2. ``migrate_vma_setup(struct migrate_vma *args)`` - - The device driver initializes the ``struct migrate_vma`` fields and passes - the pointer to migrate_vma_setup(). The ``args->flags`` field is used to - filter which source pages should be migrated. For example, setting - ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and - ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in - device private memory. If the latter flag is set, the ``args->pgmap_owner`` - field is used to identify device private pages owned by the driver. This - avoids trying to migrate device private pages residing in other devices. - Currently only anonymous private VMA ranges can be migrated to or from - system memory and device private memory. - - One of the first steps migrate_vma_setup() does is to invalidate other - device's MMUs with the ``mmu_notifier_invalidate_range_start(()`` and - ``mmu_notifier_invalidate_range_end()`` calls around the page table - walks to fill in the ``args->src`` array with PFNs to be migrated. - The ``invalidate_range_start()`` callback is passed a - ``struct mmu_notifier_range`` with the ``event`` field set to - ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to - the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is - allows the device driver to skip the invalidation callback and only - invalidate device private MMU mappings that are actually migrating. - This is explained more in the next section. - - While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()`` - entry results in a valid "zero" PFN stored in the ``args->src`` array. - This lets the driver allocate device private memory and clear it instead - of copying a page of zeros. Valid PTE entries to system memory or - device private struct pages will be locked with ``lock_page()``, isolated - from the LRU (if system memory since device private pages are not on - the LRU), unmapped from the process, and a special migration PTE is - inserted in place of the original PTE. - migrate_vma_setup() also clears the ``args->dst`` array. - -3. The device driver allocates destination pages and copies source pages to - destination pages. - - The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE`` - bit is set and skips entries that are not migrating. The device driver - can also choose to skip migrating a page by not filling in the ``dst`` - array for that page. - - The driver then allocates either a device private struct page or a - system memory page, locks the page with ``lock_page()``, and fills in the - ``dst`` array entry with:: - - dst[i] = migrate_pfn(page_to_pfn(dpage)); - - Now that the driver knows that this page is being migrated, it can - invalidate device private MMU mappings and copy device private memory - to system memory or another device private page. The core Linux kernel - handles CPU page table invalidations so the device driver only has to - invalidate its own MMU mappings. - - The driver can use ``migrate_pfn_to_page(src[i])`` to get the - ``struct page`` of the source and either copy the source page to the - destination or clear the destination device private memory if the pointer - is ``NULL`` meaning the source page was not populated in system memory. - -4. ``migrate_vma_pages()`` - - This step is where the migration is actually "committed". - - If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this - is where the newly allocated page is inserted into the CPU's page table. - This can fail if a CPU thread faults on the same page. However, the page - table is locked and only one of the new pages will be inserted. - The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared - if it loses the race. - - If the source page was locked, isolated, etc. the source ``struct page`` - information is now copied to destination ``struct page`` finalizing the - migration on the CPU side. - -5. Device driver updates device MMU page tables for pages still migrating, - rolling back pages not migrating. - - If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device - driver can update the device MMU and set the write enable bit if the - ``MIGRATE_PFN_WRITE`` bit is set. - -6. ``migrate_vma_finalize()`` - - This step replaces the special migration page table entry with the new - page's page table entry and releases the reference to the source and - destination ``struct page``. - -7. ``mmap_read_unlock()`` - - The lock can now be released. - -Exclusive access memory -======================= - -Some devices have features such as atomic PTE bits that can be used to implement -atomic access to system memory. To support atomic operations to a shared virtual -memory page such a device needs access to that page which is exclusive of any -userspace access from the CPU. The ``make_device_exclusive_range()`` function -can be used to make a memory range inaccessible from userspace. - -This replaces all mappings for pages in the given range with special swap -entries. Any attempt to access the swap entry results in a fault which is -resovled by replacing the entry with the original mapping. A driver gets -notified that the mapping has been changed by MMU notifiers, after which point -it will no longer have exclusive access to the page. Exclusive access is -guranteed to last until the driver drops the page lock and page reference, at -which point any CPU faults on the page may proceed as described. - -Memory cgroup (memcg) and rss accounting -======================================== - -For now, device memory is accounted as any regular page in rss counters (either -anonymous if device page is used for anonymous, file if device page is used for -file backed page, or shmem if device page is used for shared memory). This is a -deliberate choice to keep existing applications, that might start using device -memory without knowing about it, running unimpacted. - -A drawback is that the OOM killer might kill an application using a lot of -device memory and not a lot of regular system memory and thus not freeing much -system memory. We want to gather more real world experience on how applications -and system react under memory pressure in the presence of device memory before -deciding to account device memory differently. - - -Same decision was made for memory cgroup. Device memory pages are accounted -against same memory cgroup a regular page would be accounted to. This does -simplify migration to and from device memory. This also means that migration -back from device memory to regular memory cannot fail because it would -go above memory cgroup limit. We might revisit this choice latter on once we -get more experience in how device memory is used and its impact on memory -resource control. - - -Note that device memory can never be pinned by a device driver nor through GUP -and thus such memory is always free upon process exit. Or when last reference -is dropped in case of shared memory or file backed memory. diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/vm/hugetlbfs_reserv.rst deleted file mode 100644 index f143954e0d05..000000000000 --- a/Documentation/vm/hugetlbfs_reserv.rst +++ /dev/null @@ -1,596 +0,0 @@ -.. _hugetlbfs_reserve: - -===================== -Hugetlbfs Reservation -===================== - -Overview -======== - -Huge pages as described at :ref:`hugetlbpage` are typically -preallocated for application use. These huge pages are instantiated in a -task's address space at page fault time if the VMA indicates huge pages are -to be used. If no huge page exists at page fault time, the task is sent -a SIGBUS and often dies an unhappy death. Shortly after huge page support -was added, it was determined that it would be better to detect a shortage -of huge pages at mmap() time. The idea is that if there were not enough -huge pages to cover the mapping, the mmap() would fail. This was first -done with a simple check in the code at mmap() time to determine if there -were enough free huge pages to cover the mapping. Like most things in the -kernel, the code has evolved over time. However, the basic idea was to -'reserve' huge pages at mmap() time to ensure that huge pages would be -available for page faults in that mapping. The description below attempts to -describe how huge page reserve processing is done in the v4.10 kernel. - - -Audience -======== -This description is primarily targeted at kernel developers who are modifying -hugetlbfs code. - - -The Data Structures -=================== - -resv_huge_pages - This is a global (per-hstate) count of reserved huge pages. Reserved - huge pages are only available to the task which reserved them. - Therefore, the number of huge pages generally available is computed - as (``free_huge_pages - resv_huge_pages``). -Reserve Map - A reserve map is described by the structure:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - There is one reserve map for each huge page mapping in the system. - The regions list within the resv_map describes the regions within - the mapping. A region is described as:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - The 'from' and 'to' fields of the file region structure are huge page - indices into the mapping. Depending on the type of mapping, a - region in the reserv_map may indicate reservations exist for the - range, or reservations do not exist. -Flags for MAP_PRIVATE Reservations - These are stored in the bottom bits of the reservation map pointer. - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - Indicates this task is the owner of the reservations - associated with the mapping. - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - Indicates task originally mapping this range (and creating - reserves) has unmapped a page from this task (the child) - due to a failed COW. -Page Flags - The PagePrivate page flag is used to indicate that a huge page - reservation must be restored when the huge page is freed. More - details will be discussed in the "Freeing huge pages" section. - - -Reservation Map Location (Private or Shared) -============================================ - -A huge page mapping or segment is either private or shared. If private, -it is typically only available to a single address space (task). If shared, -it can be mapped into multiple address spaces (tasks). The location and -semantics of the reservation map is significantly different for the two types -of mappings. Location differences are: - -- For private mappings, the reservation map hangs off the VMA structure. - Specifically, vma->vm_private_data. This reserve map is created at the - time the mapping (mmap(MAP_PRIVATE)) is created. -- For shared mappings, the reservation map hangs off the inode. Specifically, - inode->i_mapping->private_data. Since shared mappings are always backed - by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode - contains a reservation map. As a result, the reservation map is allocated - when the inode is created. - - -Creating Reservations -===================== -Reservations are created when a huge page backed shared memory segment is -created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB). -These operations result in a call to the routine hugetlb_reserve_pages():: - - int hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) - -The first thing hugetlb_reserve_pages() does is check if the NORESERVE -flag was specified in either the shmget() or mmap() call. If NORESERVE -was specified, then this routine returns immediately as no reservations -are desired. - -The arguments 'from' and 'to' are huge page indices into the mapping or -underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to -the length of the segment/mapping. For mmap(), the offset argument could -be used to specify the offset into the underlying file. In such a case, -the 'from' and 'to' arguments have been adjusted by this offset. - -One of the big differences between PRIVATE and SHARED mappings is the way -in which reservations are represented in the reservation map. - -- For shared mappings, an entry in the reservation map indicates a reservation - exists or did exist for the corresponding page. As reservations are - consumed, the reservation map is not modified. -- For private mappings, the lack of an entry in the reservation map indicates - a reservation exists for the corresponding page. As reservations are - consumed, entries are added to the reservation map. Therefore, the - reservation map can also be used to determine which reservations have - been consumed. - -For private mappings, hugetlb_reserve_pages() creates the reservation map and -hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set -to indicate this VMA owns the reservations. - -The reservation map is consulted to determine how many huge page reservations -are needed for the current mapping/segment. For private mappings, this is -always the value (to - from). However, for shared mappings it is possible that -some reservations may already exist within the range (to - from). See the -section :ref:`Reservation Map Modifications ` -for details on how this is accomplished. - -The mapping may be associated with a subpool. If so, the subpool is consulted -to ensure there is sufficient space for the mapping. It is possible that the -subpool has set aside reservations that can be used for the mapping. See the -section :ref:`Subpool Reservations ` for more details. - -After consulting the reservation map and subpool, the number of needed new -reservations is known. The routine hugetlb_acct_memory() is called to check -for and take the requested number of reservations. hugetlb_acct_memory() -calls into routines that potentially allocate and adjust surplus page counts. -However, within those routines the code is simply checking to ensure there -are enough free huge pages to accommodate the reservation. If there are, -the global reservation count resv_huge_pages is adjusted something like the -following:: - - if (resv_needed <= (resv_huge_pages - free_huge_pages)) - resv_huge_pages += resv_needed; - -Note that the global lock hugetlb_lock is held when checking and adjusting -these counters. - -If there were enough free huge pages and the global count resv_huge_pages -was adjusted, then the reservation map associated with the mapping is -modified to reflect the reservations. In the case of a shared mapping, a -file_region will exist that includes the range 'from' - 'to'. For private -mappings, no modifications are made to the reservation map as lack of an -entry indicates a reservation exists. - -If hugetlb_reserve_pages() was successful, the global reservation count and -reservation map associated with the mapping will be modified as required to -ensure reservations exist for the range 'from' - 'to'. - -.. _consume_resv: - -Consuming Reservations/Allocating a Huge Page -============================================= - -Reservations are consumed when huge pages associated with the reservations -are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: - - struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) - -alloc_huge_page is passed a VMA pointer and a virtual address, so it can -consult the reservation map to determine if a reservation exists. In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves -should not be used even if it appears they have been set aside for the -specified address. The avoid_reserve argument is most often used in the case -of Copy on Write and Page Migration where additional copies of an existing -page are being allocated. - -The helper routine vma_needs_reservation() is called to determine if a -reservation exists for the address within the mapping(vma). See the section -:ref:`Reservation Map Helper Routines ` for detailed -information on what this routine does. -The value returned from vma_needs_reservation() is generally -0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists. -If a reservation does not exist, and there is a subpool associated with the -mapping the subpool is consulted to determine if it contains reservations. -If the subpool contains reservations, one can be used for this allocation. -However, in every case the avoid_reserve argument overrides the use of -a reservation for the allocation. After determining whether a reservation -exists and can be used for the allocation, the routine dequeue_huge_page_vma() -is called. This routine takes two arguments related to reservations: - -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() -- chg, even though this argument is of type long only the values 0 or 1 are - passed to dequeue_huge_page_vma. If the value is 0, it indicates a - reservation exists (see the section "Memory Policy and Reservations" for - possible issues). If the value is 1, it indicates a reservation does not - exist and the page must be taken from the global free pool if possible. - -The free lists associated with the memory policy of the VMA are searched for -a free page. If a page is found, the value free_huge_pages is decremented -when the page is removed from the free list. If there was a reservation -associated with the page, the following adjustments are made:: - - SetPagePrivate(page); /* Indicates allocating this page consumed - * a reservation, and if an error is - * encountered such that the page must be - * freed, the reservation will be restored. */ - resv_huge_pages--; /* Decrement the global reservation count */ - -Note, if no huge page can be found that satisfies the VMA's memory policy -an attempt will be made to allocate one using the buddy allocator. This -brings up the issue of surplus huge pages and overcommit which is beyond -the scope reservations. Even if a surplus page is allocated, the same -reservation based adjustments as above will be made: SetPagePrivate(page) and -resv_huge_pages--. - -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. - -The routine vma_commit_reservation() is then called to adjust the reserve -map based on the consumption of the reservation. In general, this involves -ensuring the page is represented within a file_region structure of the region -map. For shared mappings where the reservation was present, an entry -in the reserve map already existed so no change is made. However, if there -was no reservation in a shared mapping or this was a private mapping a new -entry must be created. - -It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would -be possible if hugetlb_reserve_pages was called for the same page in a shared -mapping. In such cases, the reservation count and subpool free page count -will be off by one. This rare condition can be identified by comparing the -return value from vma_needs_reservation and vma_commit_reservation. If such -a race is detected, the subpool and global reserve counts are adjusted to -compensate. See the section -:ref:`Reservation Map Helper Routines ` for more -information on these routines. - - -Instantiate Huge Pages -====================== - -After huge page allocation, the page is typically added to the page tables -of the allocating task. Before this, pages in a shared mapping are added -to the page cache and pages in private mappings are added to an anonymous -reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore, -when a huge page that has been instantiated is freed no adjustment is made -to the global reservation count (resv_huge_pages). - - -Freeing Huge Pages -================== - -Huge page freeing is performed by the routine free_huge_page(). This routine -is the destructor for hugetlbfs compound pages. As a result, it is only -passed a pointer to the page struct. When a huge page is freed, reservation -accounting may need to be performed. This would be the case if the page was -associated with a subpool that contained reserves, or the page is being freed -on an error path where a global reserve count must be restored. - -The page->private field points to any subpool associated with the page. -If the PagePrivate flag is set, it indicates the global reserve count should -be adjusted (see the section -:ref:`Consuming Reservations/Allocating a Huge Page ` -for information on how these are set). - -The routine first calls hugepage_subpool_put_pages() for the page. If this -routine returns a value of 0 (which does not equal the value passed 1) it -indicates reserves are associated with the subpool, and this newly free page -must be used to keep the number of subpool reserves above the minimum size. -Therefore, the global resv_huge_pages counter is incremented in this case. - -If the PagePrivate flag was set in the page, the global resv_huge_pages counter -will always be incremented. - -.. _sub_pool_resv: - -Subpool Reservations -==================== - -There is a struct hstate associated with each huge page size. The hstate -tracks all huge pages of the specified size. A subpool represents a subset -of pages within a hstate that is associated with a mounted hugetlbfs -filesystem. - -When a hugetlbfs filesystem is mounted a min_size option can be specified -which indicates the minimum number of huge pages required by the filesystem. -If this option is specified, the number of huge pages corresponding to -min_size are reserved for use by the filesystem. This number is tracked in -the min_hpages field of a struct hugepage_subpool. At mount time, -hugetlb_acct_memory(min_hpages) is called to reserve the specified number of -huge pages. If they can not be reserved, the mount fails. - -The routines hugepage_subpool_get/put_pages() are called when pages are -obtained from or released back to a subpool. They perform all subpool -accounting, and track any reservations associated with the subpool. -hugepage_subpool_get/put_pages are passed the number of huge pages by which -to adjust the subpool 'used page' count (down for get, up for put). Normally, -they return the same value that was passed or an error if not enough pages -exist in the subpool. - -However, if reserves are associated with the subpool a return value less -than the passed value may be returned. This return value indicates the -number of additional global pool adjustments which must be made. For example, -suppose a subpool contains 3 reserved huge pages and someone asks for 5. -The 3 reserved pages associated with the subpool can be used to satisfy part -of the request. But, 2 pages must be obtained from the global pools. To -relay this information to the caller, the value 2 is returned. The caller -is then responsible for attempting to obtain the additional two pages from -the global pools. - - -COW and Reservations -==================== - -Since shared mappings all point to and use the same underlying pages, the -biggest reservation concern for COW is private mappings. In this case, -two tasks can be pointing at the same previously allocated page. One task -attempts to write to the page, so a new page must be allocated so that each -task points to its own page. - -When the page was originally allocated, the reservation for that page was -consumed. When an attempt to allocate a new page is made as a result of -COW, it is possible that no free huge pages are free and the allocation -will fail. - -When the private mapping was originally created, the owner of the mapping -was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation -map of the owner. Since the owner created the mapping, the owner owns all -the reservations associated with the mapping. Therefore, when a write fault -occurs and there is no page available, different action is taken for the owner -and non-owner of the reservation. - -In the case where the faulting task is not the owner, the fault will fail and -the task will typically receive a SIGBUS. - -If the owner is the faulting task, we want it to succeed since it owned the -original reservation. To accomplish this, the page is unmapped from the -non-owning task. In this way, the only reference is from the owning task. -In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer -of the non-owning task. The non-owning task may receive a SIGBUS if it later -faults on a non-present page. But, the original owner of the -mapping/reservation will behave as expected. - - -.. _resv_map_modifications: - -Reservation Map Modifications -============================= - -The following low level routines are used to make modifications to a -reservation map. Typically, these routines are not called directly. Rather, -a reservation map helper routine is called which calls one of these low level -routines. These low level routines are fairly well documented in the source -code (mm/hugetlb.c). These routines are:: - - long region_chg(struct resv_map *resv, long f, long t); - long region_add(struct resv_map *resv, long f, long t); - void region_abort(struct resv_map *resv, long f, long t); - long region_count(struct resv_map *resv, long f, long t); - -Operations on the reservation map typically involve two operations: - -1) region_chg() is called to examine the reserve map and determine how - many pages in the specified range [f, t) are NOT currently represented. - - The calling code performs global checks and allocations to determine if - there are enough huge pages for the operation to succeed. - -2) - a) If the operation can succeed, region_add() is called to actually modify - the reservation map for the same range [f, t) previously passed to - region_chg(). - b) If the operation can not succeed, region_abort is called for the same - range [f, t) to abort the operation. - -Note that this is a two step process where region_add() and region_abort() -are guaranteed to succeed after a prior call to region_chg() for the same -range. region_chg() is responsible for pre-allocating any data structures -necessary to ensure the subsequent operations (specifically region_add())) -will succeed. - -As mentioned above, region_chg() determines the number of pages in the range -which are NOT currently represented in the map. This number is returned to -the caller. region_add() returns the number of pages in the range added to -the map. In most cases, the return value of region_add() is the same as the -return value of region_chg(). However, in the case of shared mappings it is -possible for changes to the reservation map to be made between the calls to -region_chg() and region_add(). In this case, the return value of region_add() -will not match the return value of region_chg(). It is likely that in such -cases global counts and subpool accounting will be incorrect and in need of -adjustment. It is the responsibility of the caller to check for this condition -and make the appropriate adjustments. - -The routine region_del() is called to remove regions from a reservation map. -It is typically called in the following situations: - -- When a file in the hugetlbfs filesystem is being removed, the inode will - be released and the reservation map freed. Before freeing the reservation - map, all the individual file_region structures must be freed. In this case - region_del is passed the range [0, LONG_MAX). -- When a hugetlbfs file is being truncated. In this case, all allocated pages - after the new file size must be freed. In addition, any file_region entries - in the reservation map past the new end of file must be deleted. In this - case, region_del is passed the range [new_end_of_file, LONG_MAX). -- When a hole is being punched in a hugetlbfs file. In this case, huge pages - are removed from the middle of the file one at a time. As the pages are - removed, region_del() is called to remove the corresponding entry from the - reservation map. In this case, region_del is passed the range - [page_idx, page_idx + 1). - -In every case, region_del() will return the number of pages removed from the -reservation map. In VERY rare cases, region_del() can fail. This can only -happen in the hole punch case where it has to split an existing file_region -entry and can not allocate a new structure. In this error case, region_del() -will return -ENOMEM. The problem here is that the reservation map will -indicate that there is a reservation for the page. However, the subpool and -global reservation counts will not reflect the reservation. To handle this -situation, the routine hugetlb_fix_reserve_counts() is called to adjust the -counters so that they correspond with the reservation map entry that could -not be deleted. - -region_count() is called when unmapping a private huge page mapping. In -private mappings, the lack of a entry in the reservation map indicates that -a reservation exists. Therefore, by counting the number of entries in the -reservation map we know how many reservations were consumed and how many are -outstanding (outstanding = (end - start) - region_count(resv, start, end)). -Since the mapping is going away, the subpool and global reservation counts -are decremented by the number of outstanding reservations. - -.. _resv_map_helpers: - -Reservation Map Helper Routines -=============================== - -Several helper routines exist to query and modify the reservation maps. -These routines are only interested with reservations for a specific huge -page, so they just pass in an address instead of a range. In addition, -they pass in the associated VMA. From the VMA, the type of mapping (private -or shared) and the location of the reservation map (inode or VMA) can be -determined. These routines simply call the underlying routines described -in the section "Reservation Map Modifications". However, they do take into -account the 'opposite' meaning of reservation map entries for private and -shared mappings and hide this detail from the caller:: - - long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This routine calls region_chg() for the specified page. If no reservation -exists, 1 is returned. If a reservation exists, 0 is returned:: - - long vma_commit_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This calls region_add() for the specified page. As in the case of region_chg -and region_add, this routine is to be called after a previous call to -vma_needs_reservation. It will add a reservation entry for the page. It -returns 1 if the reservation was added and 0 if not. The return value should -be compared with the return value of the previous call to -vma_needs_reservation. An unexpected difference indicates the reservation -map was modified between calls:: - - void vma_end_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This calls region_abort() for the specified page. As in the case of region_chg -and region_abort, this routine is to be called after a previous call to -vma_needs_reservation. It will abort/end the in progress reservation add -operation:: - - long vma_add_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This is a special wrapper routine to help facilitate reservation cleanup -on error paths. It is only called from the routine restore_reserve_on_error(). -This routine is used in conjunction with vma_needs_reservation in an attempt -to add a reservation to the reservation map. It takes into account the -different reservation map semantics for private and shared mappings. Hence, -region_add is called for shared mappings (as an entry present in the map -indicates a reservation), and region_del is called for private mappings (as -the absence of an entry in the map indicates a reservation). See the section -"Reservation cleanup in error paths" for more information on what needs to -be done on error paths. - - -Reservation Cleanup in Error Paths -================================== - -As mentioned in the section -:ref:`Reservation Map Helper Routines `, reservation -map modifications are performed in two steps. First vma_needs_reservation -is called before a page is allocated. If the allocation is successful, -then vma_commit_reservation is called. If not, vma_end_reservation is called. -Global and subpool reservation counts are adjusted based on success or failure -of the operation and all is well. - -Additionally, after a huge page is instantiated the PagePrivate flag is -cleared so that accounting when the page is ultimately freed is correct. - -However, there are several instances where errors are encountered after a huge -page is allocated but before it is instantiated. In this case, the page -allocation has consumed the reservation and made the appropriate subpool, -reservation map and global count adjustments. If the page is freed at this -time (before instantiation and clearing of PagePrivate), then free_huge_page -will increment the global reservation count. However, the reservation map -indicates the reservation was consumed. This resulting inconsistent state -will cause the 'leak' of a reserved huge page. The global reserve count will -be higher than it should and prevent allocation of a pre-allocated page. - -The routine restore_reserve_on_error() attempts to handle this situation. It -is fairly well documented. The intention of this routine is to restore -the reservation map to the way it was before the page allocation. In this -way, the state of the reservation map will correspond to the global reservation -count after the page is freed. - -The routine restore_reserve_on_error itself may encounter errors while -attempting to restore the reservation map entry. In this case, it will -simply clear the PagePrivate flag of the page. In this way, the global -reserve count will not be incremented when the page is freed. However, the -reservation map will continue to look as though the reservation was consumed. -A page can still be allocated for the address, but it will not use a reserved -page as originally intended. - -There is some code (most notably userfaultfd) which can not call -restore_reserve_on_error. In this case, it simply modifies the PagePrivate -so that a reservation will not be leaked when the huge page is freed. - - -Reservations and Memory Policy -============================== -Per-node huge page lists existed in struct hstate when git was first used -to manage Linux code. The concept of reservations was added some time later. -When reservations were added, no attempt was made to take memory policy -into account. While cpusets are not exactly the same as memory policy, this -comment in hugetlb_acct_memory sums up the interaction between reservations -and cpusets/memory policy:: - - /* - * When cpuset is configured, it breaks the strict hugetlb page - * reservation as the accounting is done on a global variable. Such - * reservation is completely rubbish in the presence of cpuset because - * the reservation is not checked against page availability for the - * current cpuset. Application can still potentially OOM'ed by kernel - * with lack of free htlb page in cpuset that the task is in. - * Attempt to enforce strict accounting with cpuset is almost - * impossible (or too ugly) because cpuset is too fluid that - * task or memory node can be dynamically moved between cpusets. - * - * The change of semantics for shared hugetlb mapping with cpuset is - * undesirable. However, in order to preserve some of the semantics, - * we fall back to check against current free page availability as - * a best attempt and hopefully to minimize the impact of changing - * semantics that cpuset has. - */ - -Huge page reservations were added to prevent unexpected page allocation -failures (OOM) at page fault time. However, if an application makes use -of cpusets or memory policy there is no guarantee that huge pages will be -available on the required nodes. This is true even if there are a sufficient -number of global reservations. - -Hugetlbfs regression testing -============================ - -The most complete set of hugetlb tests are in the libhugetlbfs repository. -If you modify any hugetlb related code, use the libhugetlbfs test suite -to check for regressions. In addition, if you add any new hugetlb -functionality, please add appropriate tests to libhugetlbfs. - --- -Mike Kravetz, 7 April 2017 diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst deleted file mode 100644 index b9d5253c1305..000000000000 --- a/Documentation/vm/hwpoison.rst +++ /dev/null @@ -1,184 +0,0 @@ -.. hwpoison: - -======== -hwpoison -======== - -What is hwpoison? -================= - -Upcoming Intel CPUs have support for recovering from some memory errors -(``MCA recovery``). This requires the OS to declare a page "poisoned", -kill the processes associated with it and avoid using it in the future. - -This patchkit implements the necessary infrastructure in the VM. - -To quote the overview comment:: - - High level machine check handler. Handles pages reported by the - hardware as being corrupted usually due to a 2bit ECC memory or cache - failure. - - This focusses on pages detected as corrupted in the background. - When the current CPU tries to consume corruption the currently - running process can just be killed directly instead. This implies - that if the error cannot be handled for some reason it's safe to - just ignore it because no corruption has been consumed yet. Instead - when that happens another machine check will happen. - - Handles page cache pages in various states. The tricky part - here is that we can access any page asynchronous to other VM - users, because memory failures could happen anytime and anywhere, - possibly violating some of their assumptions. This is why this code - has to be extremely careful. Generally it tries to use normal locking - rules, as in get the standard locks, even if that means the - error handling takes potentially a long time. - - Some of the operations here are somewhat inefficient and have non - linear algorithmic complexity, because the data structures have not - been optimized for this case. This is in particular the case - for the mapping from a vma to a process. Since this case is expected - to be rare we hope we can get away with this. - -The code consists of a the high level handler in mm/memory-failure.c, -a new page poison bit and various checks in the VM to handle poisoned -pages. - -The main target right now is KVM guests, but it works for all kinds -of applications. KVM support requires a recent qemu-kvm release. - -For the KVM use there was need for a new signal type so that -KVM can inject the machine check into the guest with the proper -address. This in theory allows other applications to handle -memory failures too. The expection is that near all applications -won't do that, but some very specialized ones might. - -Failure recovery modes -====================== - -There are two (actually three) modes memory failure recovery can be in: - -vm.memory_failure_recovery sysctl set to zero: - All memory failures cause a panic. Do not attempt recovery. - -early kill - (can be controlled globally and per process) - Send SIGBUS to the application as soon as the error is detected - This allows applications who can process memory errors in a gentle - way (e.g. drop affected object) - This is the mode used by KVM qemu. - -late kill - Send SIGBUS when the application runs into the corrupted page. - This is best for memory error unaware applications and default - Note some pages are always handled as late kill. - -User control -============ - -vm.memory_failure_recovery - See sysctl.txt - -vm.memory_failure_early_kill - Enable early kill mode globally - -PR_MCE_KILL - Set early/late kill mode/revert to system default - - arg1: PR_MCE_KILL_CLEAR: - Revert to system default - arg1: PR_MCE_KILL_SET: - arg2 defines thread specific mode - - PR_MCE_KILL_EARLY: - Early kill - PR_MCE_KILL_LATE: - Late kill - PR_MCE_KILL_DEFAULT - Use system global default - - Note that if you want to have a dedicated thread which handles - the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should - call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, - the SIGBUS is sent to the main thread. - -PR_MCE_KILL_GET - return current mode - -Testing -======= - -* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the - process for testing - -* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/`` - - corrupt-pfn - Inject hwpoison fault at PFN echoed into this file. This does - some early filtering to avoid corrupted unintended pages in test suites. - - unpoison-pfn - Software-unpoison page at PFN echoed into this file. This way - a page can be reused again. This only works for Linux - injected failures, not for real memory failures. Once any hardware - memory failure happens, this feature is disabled. - - Note these injection interfaces are not stable and might change between - kernel versions - - corrupt-filter-dev-major, corrupt-filter-dev-minor - Only handle memory failures to pages associated with the file - system defined by block device major/minor. -1U is the - wildcard value. This should be only used for testing with - artificial injection. - - corrupt-filter-memcg - Limit injection to pages owned by memgroup. Specified by inode - number of the memcg. - - Example:: - - mkdir /sys/fs/cgroup/mem/hwpoison - - usemem -m 100 -s 1000 & - echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks - - memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') - echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg - - page-types -p `pidof init` --hwpoison # shall do nothing - page-types -p `pidof usemem` --hwpoison # poison its pages - - corrupt-filter-flags-mask, corrupt-filter-flags-value - When specified, only poison pages if ((page_flags & mask) == - value). This allows stress testing of many kinds of - pages. The page_flags are the same as in /proc/kpageflags. The - flag bits are defined in include/linux/kernel-page-flags.h and - documented in Documentation/admin-guide/mm/pagemap.rst - -* Architecture specific MCE injector - - x86 has mce-inject, mce-test - - Some portable hwpoison test programs in mce-test, see below. - -References -========== - -http://halobates.de/mce-lc09-2.pdf - Overview presentation from LinuxCon 09 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git - Test suite (hwpoison specific portable tests in tsrc) - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git - x86 specific injector - - -Limitations -=========== -- Not all page types are supported and never will. Most kernel internal - objects cannot be recovered, only LRU pages for now. - ---- -Andi Kleen, Oct 2009 diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst deleted file mode 100644 index 575ccd40e30c..000000000000 --- a/Documentation/vm/index.rst +++ /dev/null @@ -1,68 +0,0 @@ -===================================== -Linux Memory Management Documentation -===================================== - -Memory Management Guide -======================= - -This is a guide to understanding the memory management subsystem -of Linux. If you are looking for advice on simply allocating memory, -see the :ref:`memory_allocation`. For controlling and tuning guides, -see the :doc:`admin guide <../admin-guide/mm/index>`. - -.. toctree:: - :maxdepth: 1 - - physical_memory - page_tables - process_addrs - bootmem - page_allocation - vmalloc - slab - highmem - page_reclaim - swap - page_cache - shmfs - oom - -Legacy Documentation -==================== - -This is a collection of older documents about the Linux memory management -(MM) subsystem internals with different level of details ranging from -notes and mailing list responses for elaborating descriptions of data -structures and algorithms. It should all be integrated nicely into the -above structured documentation, or deleted if it has served its purpose. - -.. toctree:: - :maxdepth: 1 - - active_mm - arch_pgtable_helpers - balance - damon/index - free_page_reporting - frontswap - hmm - hwpoison - hugetlbfs_reserv - ksm - memory-model - mmu_notifier - numa - overcommit-accounting - page_migration - page_frags - page_owner - page_table_check - remap_file_pages - slub - split_page_table_lock - transhuge - unevictable-lru - vmalloced-kernel-stacks - vmemmap_dedup - z3fold - zsmalloc diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst deleted file mode 100644 index 9e37add068e6..000000000000 --- a/Documentation/vm/ksm.rst +++ /dev/null @@ -1,87 +0,0 @@ -.. _ksm: - -======================= -Kernel Samepage Merging -======================= - -KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, -added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation, -and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/ - -The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst ` - -Design -====== - -Overview --------- - -.. kernel-doc:: mm/ksm.c - :DOC: Overview - -Reverse mapping ---------------- -KSM maintains reverse mapping information for KSM pages in the stable -tree. - -If a KSM page is shared between less than ``max_page_sharing`` VMAs, -the node of the stable tree that represents such KSM page points to a -list of struct rmap_item and the ``page->mapping`` of the -KSM page points to the stable tree node. - -When the sharing passes this threshold, KSM adds a second dimension to -the stable tree. The tree node becomes a "chain" that links one or -more "dups". Each "dup" keeps reverse mapping information for a KSM -page with ``page->mapping`` pointing to that "dup". - -Every "chain" and all "dups" linked into a "chain" enforce the -invariant that they represent the same write protected memory content, -even if each "dup" will be pointed by a different KSM page copy of -that content. - -This way the stable tree lookup computational complexity is unaffected -if compared to an unlimited list of reverse mappings. It is still -enforced that there cannot be KSM page content duplicates in the -stable tree itself. - -The deduplication limit enforced by ``max_page_sharing`` is required -to avoid the virtual memory rmap lists to grow too large. The rmap -walk has O(N) complexity where N is the number of rmap_items -(i.e. virtual mappings) that are sharing the page, which is in turn -capped by ``max_page_sharing``. So this effectively spreads the linear -O(N) computational complexity from rmap walk context over different -KSM pages. The ksmd walk over the stable_node "chains" is also O(N), -but N is the number of stable_node "dups", not the number of -rmap_items, so it has not a significant impact on ksmd performance. In -practice the best stable_node "dup" candidate will be kept and found -at the head of the "dups" list. - -High values of ``max_page_sharing`` result in faster memory merging -(because there will be fewer stable_node dups queued into the -stable_node chain->hlist to check for pruning) and higher -deduplication factor at the expense of slower worst case for rmap -walks for any KSM page which can happen during swapping, compaction, -NUMA balancing and page migration. - -The ``stable_node_dups/stable_node_chains`` ratio is also affected by the -``max_page_sharing`` tunable, and an high ratio may indicate fragmentation -in the stable_node dups, which could be solved by introducing -fragmentation algorithms in ksmd which would refile rmap_items from -one stable_node dup to another stable_node dup, in order to free up -stable_node "dups" with few rmap_items in them, but that may increase -the ksmd CPU usage and possibly slowdown the readonly computations on -the KSM pages of the applications. - -The whole list of stable_node "dups" linked in the stable_node -"chains" is scanned periodically in order to prune stale stable_nodes. -The frequency of such scans is defined by -``stable_node_chains_prune_millisecs`` sysfs tunable. - -Reference ---------- -.. kernel-doc:: mm/ksm.c - :functions: mm_slot ksm_scan stable_node rmap_item - --- -Izik Eidus, -Hugh Dickins, 17 Nov 2009 diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst deleted file mode 100644 index 30e8fbed6914..000000000000 --- a/Documentation/vm/memory-model.rst +++ /dev/null @@ -1,177 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _physical_memory_model: - -===================== -Physical Memory Model -===================== - -Physical memory in a system may be addressed in different ways. The -simplest case is when the physical memory starts at address 0 and -spans a contiguous range up to the maximal address. It could be, -however, that this range contains small holes that are not accessible -for the CPU. Then there could be several contiguous ranges at -completely distinct addresses. And, don't forget about NUMA, where -different memory banks are attached to different CPUs. - -Linux abstracts this diversity using one of the two memory models: -FLATMEM and SPARSEMEM. Each architecture defines what -memory models it supports, what the default memory model is and -whether it is possible to manually override that default. - -All the memory models track the status of physical page frames using -struct page arranged in one or more arrays. - -Regardless of the selected memory model, there exists one-to-one -mapping between the physical page frame number (PFN) and the -corresponding `struct page`. - -Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn` -helpers that allow the conversion from PFN to `struct page` and vice -versa. - -FLATMEM -======= - -The simplest memory model is FLATMEM. This model is suitable for -non-NUMA systems with contiguous, or mostly contiguous, physical -memory. - -In the FLATMEM memory model, there is a global `mem_map` array that -maps the entire physical memory. For most architectures, the holes -have entries in the `mem_map` array. The `struct page` objects -corresponding to the holes are never fully initialized. - -To allocate the `mem_map` array, architecture specific setup code should -call :c:func:`free_area_init` function. Yet, the mappings array is not -usable until the call to :c:func:`memblock_free_all` that hands all the -memory to the page allocator. - -An architecture may free parts of the `mem_map` array that do not cover the -actual physical pages. In such case, the architecture specific -:c:func:`pfn_valid` implementation should take the holes in the -`mem_map` into account. - -With FLATMEM, the conversion between a PFN and the `struct page` is -straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the -`mem_map` array. - -The `ARCH_PFN_OFFSET` defines the first page frame number for -systems with physical memory starting at address different from 0. - -SPARSEMEM -========= - -SPARSEMEM is the most versatile memory model available in Linux and it -is the only memory model that supports several advanced features such -as hot-plug and hot-remove of the physical memory, alternative memory -maps for non-volatile memory devices and deferred initialization of -the memory map for larger systems. - -The SPARSEMEM model presents the physical memory as a collection of -sections. A section is represented with struct mem_section -that contains `section_mem_map` that is, logically, a pointer to an -array of struct pages. However, it is stored with some other magic -that aids the sections management. The section size and maximal number -of section is specified using `SECTION_SIZE_BITS` and -`MAX_PHYSMEM_BITS` constants defined by each architecture that -supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a -physical address that an architecture supports, the -`SECTION_SIZE_BITS` is an arbitrary value. - -The maximal number of sections is denoted `NR_MEM_SECTIONS` and -defined as - -.. math:: - - NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} - -The `mem_section` objects are arranged in a two-dimensional array -called `mem_sections`. The size and placement of this array depend -on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of -sections: - -* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections` - array is static and has `NR_MEM_SECTIONS` rows. Each row holds a - single `mem_section` object. -* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections` - array is dynamically allocated. Each row contains PAGE_SIZE worth of - `mem_section` objects and the number of rows is calculated to fit - all the memory sections. - -The architecture setup code should call sparse_init() to -initialize the memory sections and the memory maps. - -With SPARSEMEM there are two possible ways to convert a PFN to the -corresponding `struct page` - a "classic sparse" and "sparse -vmemmap". The selection is made at build time and it is determined by -the value of `CONFIG_SPARSEMEM_VMEMMAP`. - -The classic sparse encodes the section number of a page in page->flags -and uses high bits of a PFN to access the section that maps that page -frame. Inside a section, the PFN is the index to the array of pages. - -The sparse vmemmap uses a virtually mapped memory map to optimize -pfn_to_page and page_to_pfn operations. There is a global `struct -page *vmemmap` pointer that points to a virtually contiguous array of -`struct page` objects. A PFN is an index to that array and the -offset of the `struct page` from `vmemmap` is the PFN of that -page. - -To use vmemmap, an architecture has to reserve a range of virtual -addresses that will map the physical pages containing the memory -map and make sure that `vmemmap` points to that range. In addition, -the architecture should implement :c:func:`vmemmap_populate` method -that will allocate the physical memory and create page tables for the -virtual memory map. If an architecture does not have any special -requirements for the vmemmap mappings, it can use default -:c:func:`vmemmap_populate_basepages` provided by the generic memory -management. - -The virtually mapped memory map allows storing `struct page` objects -for persistent memory devices in pre-allocated storage on those -devices. This storage is represented with struct vmem_altmap -that is eventually passed to vmemmap_populate() through a long chain -of function calls. The vmemmap_populate() implementation may use the -`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to -allocate memory map on the persistent memory device. - -ZONE_DEVICE -=========== -The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer -`struct page` `mem_map` services for device driver identified physical -address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact -that the page objects for these address ranges are never marked online, -and that a reference must be taken against the device, not just the page -to keep the memory pinned for active use. `ZONE_DEVICE`, via -:c:func:`devm_memremap_pages`, performs just enough memory hotplug to -turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and -:c:func:`get_user_pages` service for the given range of pfns. Since the -page reference count never drops below 1 the page is never tracked as -free memory and the page's `struct list_head lru` space is repurposed -for back referencing to the host device / driver that mapped the memory. - -While `SPARSEMEM` presents memory as a collection of sections, -optionally collected into memory blocks, `ZONE_DEVICE` users have a need -for smaller granularity of populating the `mem_map`. Given that -`ZONE_DEVICE` memory is never marked online it is subsequently never -subject to its memory ranges being exposed through the sysfs memory -hotplug api on memory block boundaries. The implementation relies on -this lack of user-api constraint to allow sub-section sized memory -ranges to be specified to :c:func:`arch_add_memory`, the top-half of -memory hotplug. Sub-section support allows for 2MB as the cross-arch -common alignment granularity for :c:func:`devm_memremap_pages`. - -The users of `ZONE_DEVICE` are: - -* pmem: Map platform persistent memory to be used as a direct-I/O target - via DAX mappings. - -* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` - event callbacks to allow a device-driver to coordinate memory management - events related to device-memory, typically GPU memory. See - Documentation/vm/hmm.rst. - -* p2pdma: Create `struct page` objects to allow peer devices in a - PCI/-E topology to coordinate direct-DMA operations between themselves, - i.e. bypass host memory. diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst deleted file mode 100644 index df5d7777fc6b..000000000000 --- a/Documentation/vm/mmu_notifier.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. _mmu_notifier: - -When do you need to notify inside page table lock ? -=================================================== - -When clearing a pte/pmd we are given a choice to notify the event through -(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under -the page table lock. But that notification is not necessary in all cases. - -For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use -thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a -process virtual address space). There is only 2 cases when you need to notify -those secondary TLB while holding page table lock when clearing a pte/pmd: - - A) page backing address is free before mmu_notifier_invalidate_range_end() - B) a page table entry is updated to point to a new page (COW, write fault - on zero page, __replace_page(), ...) - -Case A is obvious you do not want to take the risk for the device to write to -a page that might now be used by some completely different task. - -Case B is more subtle. For correctness it requires the following sequence to -happen: - - - take page table lock - - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) - - set page table entry to point to new page - -If clearing the page table entry is not followed by a notify before setting -the new pte/pmd value then you can break memory model like C11 or C++11 for -the device. - -Consider the following scenario (device use a feature similar to ATS/PASID): - -Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume -they are write protected for COW (other case of B apply too). - -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {try to write to addrA} - CPU-thread-1 {try to write to addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA and populate device TLB} - DEV-thread-2 {read addrB and populate device TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] ------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} - CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {write to addrA which is a write to new page} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {write to addrB which is a write to new page} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA from old page} - DEV-thread-2 {read addrB from new page} - -So here because at time N+2 the clear page table entry was not pair with a -notification to invalidate the secondary TLB, the device see the new value for -addrB before seeing the new value for addrA. This break total memory ordering -for the device. - -When changing a pte to write protect or to point to a new write protected page -with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range -call to mmu_notifier_invalidate_range_end() outside the page table lock. This -is true even if the thread doing the page table update is preempted right after -releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst deleted file mode 100644 index 99fdeca917ca..000000000000 --- a/Documentation/vm/numa.rst +++ /dev/null @@ -1,150 +0,0 @@ -.. _numa: - -Started Nov 1999 by Kanoj Sarcar - -============= -What is NUMA? -============= - -This question can be answered from a couple of perspectives: the -hardware view and the Linux software view. - -From the hardware perspective, a NUMA system is a computer platform that -comprises multiple components or assemblies each of which may contain 0 -or more CPUs, local memory, and/or IO buses. For brevity and to -disambiguate the hardware view of these physical components/assemblies -from the software abstraction thereof, we'll call the components/assemblies -'cells' in this document. - -Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset -of the system--although some components necessary for a stand-alone SMP system -may not be populated on any given cell. The cells of the NUMA system are -connected together with some sort of system interconnect--e.g., a crossbar or -point-to-point link are common types of NUMA system interconnects. Both of -these types of interconnects can be aggregated to create NUMA platforms with -cells at multiple distances from other cells. - -For Linux, the NUMA platforms of interest are primarily what is known as Cache -Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible -to and accessible from any CPU attached to any cell and cache coherency -is handled in hardware by the processor caches and/or the system interconnect. - -Memory access time and effective memory bandwidth varies depending on how far -away the cell containing the CPU or IO bus making the memory access is from the -cell containing the target memory. For example, access to memory by CPUs -attached to the same cell will experience faster access times and higher -bandwidths than accesses to memory on other, remote cells. NUMA platforms -can have cells at multiple remote distances from any given cell. - -Platform vendors don't build NUMA systems just to make software developers' -lives interesting. Rather, this architecture is a means to provide scalable -memory bandwidth. However, to achieve scalable memory bandwidth, system and -application software must arrange for a large majority of the memory references -[cache misses] to be to "local" memory--memory on the same cell, if any--or -to the closest cell with memory. - -This leads to the Linux software view of a NUMA system: - -Linux divides the system's hardware resources into multiple software -abstractions called "nodes". Linux maps the nodes onto the physical cells -of the hardware platform, abstracting away some of the details for some -architectures. As with physical cells, software nodes may contain 0 or more -CPUs, memory and/or IO buses. And, again, memory accesses to memory on -"closer" nodes--nodes that map to closer cells--will generally experience -faster access times and higher effective bandwidth than accesses to more -remote cells. - -For some architectures, such as x86, Linux will "hide" any node representing a -physical cell that has no memory attached, and reassign any CPUs attached to -that cell to a node representing a cell that does have memory. Thus, on -these architectures, one cannot assume that all CPUs that Linux associates with -a given node will see the same local memory access times and bandwidth. - -In addition, for some architectures, again x86 is an example, Linux supports -the emulation of additional nodes. For NUMA emulation, linux will carve up -the existing nodes--or the system memory for non-NUMA platforms--into multiple -nodes. Each emulated node will manage a fraction of the underlying cells' -physical memory. NUMA emluation is useful for testing NUMA kernel and -application features on non-NUMA platforms, and as a sort of memory resource -management mechanism when used together with cpusets. -[see Documentation/admin-guide/cgroup-v1/cpusets.rst] - -For each node with memory, Linux constructs an independent memory management -subsystem, complete with its own free page lists, in-use page lists, usage -statistics and locks to mediate access. In addition, Linux constructs for -each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], -an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a -selected zone/node cannot satisfy the allocation request. This situation, -when a zone has no available memory to satisfy a request, is called -"overflow" or "fallback". - -Because some nodes contain multiple zones containing different types of -memory, Linux must decide whether to order the zonelists such that allocations -fall back to the same zone type on a different node, or to a different zone -type on the same node. This is an important consideration because some zones, -such as DMA or DMA32, represent relatively scarce resources. Linux chooses -a default Node ordered zonelist. This means it tries to fallback to other zones -from the same node before using remote nodes which are ordered by NUMA distance. - -By default, Linux will attempt to satisfy memory allocation requests from the -node to which the CPU that executes the request is assigned. Specifically, -Linux will attempt to allocate from the first node in the appropriate zonelist -for the node where the request originates. This is called "local allocation." -If the "local" node cannot satisfy the request, the kernel will examine other -nodes' zones in the selected zonelist looking for the first zone in the list -that can satisfy the request. - -Local allocation will tend to keep subsequent access to the allocated memory -"local" to the underlying physical resources and off the system interconnect-- -as long as the task on whose behalf the kernel allocated some memory does not -later migrate away from that memory. The Linux scheduler is aware of the -NUMA topology of the platform--embodied in the "scheduling domains" data -structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler -attempts to minimize task migration to distant scheduling domains. However, -the scheduler does not take a task's NUMA footprint into account directly. -Thus, under sufficient imbalance, tasks can migrate between nodes, remote -from their initial node and kernel data structures. - -System administrators and application designers can restrict a task's migration -to improve NUMA locality using various CPU affinity command line interfaces, -such as taskset(1) and numactl(1), and program interfaces such as -sched_setaffinity(2). Further, one can modify the kernel's default local -allocation behavior using Linux NUMA memory policy. [see -:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. - -System administrators can restrict the CPUs and nodes' memories that a non- -privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] - -On architectures that do not hide memoryless nodes, Linux will include only -zones [nodes] with memory in the zonelists. This means that for a memoryless -node the "local memory node"--the node of the first zone in CPU's node's -zonelist--will not be the node itself. Rather, it will be the node that the -kernel selected as the nearest node with memory when it built the zonelists. -So, default, local allocations will succeed with the kernel supplying the -closest available memory. This is a consequence of the same mechanism that -allows such allocations to fallback to other nearby nodes when a node that -does contain memory overflows. - -Some kernel allocations do not want or cannot tolerate this allocation fallback -behavior. Rather they want to be sure they get memory from the specified node -or get notified that the node has no free memory. This is usually the case when -a subsystem allocates per CPU memory resources, for example. - -A typical model for making such an allocation is to obtain the node id of the -node to which the "current CPU" is attached using one of the kernel's -numa_node_id() or CPU_to_node() functions and then request memory from only -the node id returned. When such an allocation fails, the requesting subsystem -may revert to its own fallback path. The slab kernel memory allocator is an -example of this. Or, the subsystem may choose to disable or not to enable -itself on allocation failure. The kernel profiling subsystem is an example of -this. - -If the architecture supports--does not hide--memoryless nodes, then CPUs -attached to memoryless nodes would always incur the fallback path overhead -or some subsystems would fail to initialize if they attempted to allocated -memory exclusively from a node without memory. To support such -architectures transparently, kernel subsystems can use the numa_mem_id() -or cpu_to_mem() function to locate the "local memory node" for the calling or -specified CPU. Again, this is the same node from which default, local page -allocations will be attempted. diff --git a/Documentation/vm/oom.rst b/Documentation/vm/oom.rst deleted file mode 100644 index 18e9e40c1ec1..000000000000 --- a/Documentation/vm/oom.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -Out Of Memory Handling -====================== diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst deleted file mode 100644 index 1addb0c374a4..000000000000 --- a/Documentation/vm/overcommit-accounting.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. _overcommit_accounting: - -===================== -Overcommit Accounting -===================== - -The Linux kernel supports the following overcommit handling modes - -0 - Heuristic overcommit handling. Obvious overcommits of address - space are refused. Used for a typical system. It ensures a - seriously wild allocation fails while allowing overcommit to - reduce swap usage. root is allowed to allocate slightly more - memory in this mode. This is the default. - -1 - Always overcommit. Appropriate for some scientific - applications. Classic example is code using sparse arrays and - just relying on the virtual memory consisting almost entirely - of zero pages. - -2 - Don't overcommit. The total address space commit for the - system is not permitted to exceed swap + a configurable amount - (default is 50%) of physical RAM. Depending on the amount you - use, in most situations this means a process will not be - killed while accessing pages but will receive errors on memory - allocation as appropriate. - - Useful for applications that want to guarantee their memory - allocations will be available in the future without having to - initialize every page. - -The overcommit policy is set via the sysctl ``vm.overcommit_memory``. - -The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) -or ``vm.overcommit_kbytes`` (absolute value). These only have an effect -when ``vm.overcommit_memory`` is set to 2. - -The current overcommit limit and amount committed are viewable in -``/proc/meminfo`` as CommitLimit and Committed_AS respectively. - -Gotchas -======= - -The C language stack growth does an implicit mremap. If you want absolute -guarantees and run close to the edge you MUST mmap your stack for the -largest size you think you will need. For typical stack usage this does -not matter much but it's a corner case if you really really care - -In mode 2 the MAP_NORESERVE flag is ignored. - - -How It Works -============ - -The overcommit is based on the following rules - -For a file backed map - | SHARED or READ-only - 0 cost (the file is the map not swap) - | PRIVATE WRITABLE - size of mapping per instance - -For an anonymous or ``/dev/zero`` map - | SHARED - size of mapping - | PRIVATE READ-only - 0 cost (but of little use) - | PRIVATE WRITABLE - size of mapping per instance - -Additional accounting - | Pages made writable copies by mmap - | shmfs memory drawn from the same pool - -Status -====== - -* We account mmap memory mappings -* We account mprotect changes in commit -* We account mremap changes in size -* We account brk -* We account munmap -* We report the commit status in /proc -* Account and check on fork -* Review stack handling/building on exec -* SHMfs accounting -* Implement actual limit enforcement - -To Do -===== -* Account ptrace pages (this is hard) diff --git a/Documentation/vm/page_allocation.rst b/Documentation/vm/page_allocation.rst deleted file mode 100644 index d9b4495561f1..000000000000 --- a/Documentation/vm/page_allocation.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Page Allocation -=============== diff --git a/Documentation/vm/page_cache.rst b/Documentation/vm/page_cache.rst deleted file mode 100644 index 75eba7c431b2..000000000000 --- a/Documentation/vm/page_cache.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========== -Page Cache -========== diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst deleted file mode 100644 index 7d6f9385d129..000000000000 --- a/Documentation/vm/page_frags.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. _page_frags: - -============== -Page fragments -============== - -A page fragment is an arbitrary-length arbitrary-offset area of memory -which resides within a 0 or higher order compound page. Multiple -fragments within that page are individually refcounted, in the page's -reference counter. - -The page_frag functions, page_frag_alloc and page_frag_free, provide a -simple allocation framework for page fragments. This is used by the -network stack and network device drivers to provide a backing region of -memory for use as either an sk_buff->head, or to be used in the "frags" -portion of skb_shared_info. - -In order to make use of the page fragment APIs a backing page fragment -cache is needed. This provides a central point for the fragment allocation -and tracks allows multiple calls to make use of a cached page. The -advantage to doing this is that multiple calls to get_page can be avoided -which can be expensive at allocation time. However due to the nature of -this caching it is required that any calls to the cache be protected by -either a per-cpu limitation, or a per-cpu limitation and forcing interrupts -to be disabled when executing the fragment allocation. - -The network stack uses two separate caches per CPU to handle fragment -allocation. The netdev_alloc_cache is used by callers making use of the -netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is -used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The -main difference between these two calls is the context in which they may be -called. The "netdev" prefixed functions are usable in any context as these -functions will disable interrupts, while the "napi" prefixed functions are -only usable within the softirq context. - -Many network device drivers use a similar methodology for allocating page -fragments, but the page fragments are cached at the ring or descriptor -level. In order to enable these cases it is necessary to provide a generic -way of tearing down a page cache. For this reason __page_frag_cache_drain -was implemented. It allows for freeing multiple references from a single -page via a single call. The advantage to doing this is that it allows for -cleaning up the multiple references that were added to a page in order to -avoid calling get_page per allocation. - -Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst deleted file mode 100644 index 8c5cb8147e55..000000000000 --- a/Documentation/vm/page_migration.rst +++ /dev/null @@ -1,288 +0,0 @@ -.. _page_migration: - -============== -Page migration -============== - -Page migration allows moving the physical location of pages between -nodes in a NUMA system while the process is running. This means that the -virtual addresses that the process sees do not change. However, the -system rearranges the physical location of those pages. - -Also see :ref:`Heterogeneous Memory Management (HMM) ` -for migrating pages to or from device private memory. - -The main intent of page migration is to reduce the latency of memory accesses -by moving pages near to the processor where the process accessing that memory -is running. - -Page migration allows a process to manually relocate the node on which its -pages are located through the MF_MOVE and MF_MOVE_ALL options while setting -a new memory policy via mbind(). The pages of a process can also be relocated -from another process using the sys_migrate_pages() function call. The -migrate_pages() function call takes two sets of nodes and moves pages of a -process that are located on the from nodes to the destination nodes. -Page migration functions are provided by the numactl package by Andi Kleen -(a version later than 0.9.3 is required. Get it from -https://github.com/numactl/numactl.git). numactl provides libnuma -which provides an interface similar to other NUMA functionality for page -migration. cat ``/proc//numa_maps`` allows an easy review of where the -pages of a process are located. See also the numa_maps documentation in the -proc(5) man page. - -Manual migration is useful if for example the scheduler has relocated -a process to a processor on a distant node. A batch scheduler or an -administrator may detect the situation and move the pages of the process -nearer to the new processor. The kernel itself only provides -manual page migration support. Automatic page migration may be implemented -through user space processes that move pages. A special function call -"move_pages" allows the moving of individual pages within a process. -For example, A NUMA profiler may obtain a log showing frequent off-node -accesses and may use the result to move pages to more advantageous -locations. - -Larger installations usually partition the system using cpusets into -sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See -:ref:`CPUSETS `). -Cpusets allow the automation of process locality. If a task is moved to -a new cpuset then also all its pages are moved with it so that the -performance of the process does not sink dramatically. Also the pages -of processes in a cpuset are moved if the allowed memory nodes of a -cpuset are changed. - -Page migration allows the preservation of the relative location of pages -within a group of nodes for all migration techniques which will preserve a -particular memory allocation pattern generated even after migrating a -process. This is necessary in order to preserve the memory latencies. -Processes will run with similar performance after migration. - -Page migration occurs in several steps. First a high level -description for those trying to use migrate_pages() from the kernel -(for userspace usage see the Andi Kleen's numactl package mentioned above) -and then a low level description of how the low level details work. - -In kernel use of migrate_pages() -================================ - -1. Remove pages from the LRU. - - Lists of pages to be migrated are generated by scanning over - pages and moving them into lists. This is done by - calling isolate_lru_page(). - Calling isolate_lru_page() increases the references to the page - so that it cannot vanish while the page migration occurs. - It also prevents the swapper or other scans from encountering - the page. - -2. We need to have a function of type new_page_t that can be - passed to migrate_pages(). This function should figure out - how to allocate the correct new page given the old page. - -3. The migrate_pages() function is called which attempts - to do the migration. It will call the function to allocate - the new page for each page that is considered for - moving. - -How migrate_pages() works -========================= - -migrate_pages() does several passes over its list of pages. A page is moved -if all references to a page are removable at the time. The page has -already been removed from the LRU via isolate_lru_page() and the refcount -is increased so that the page cannot be freed while page migration occurs. - -Steps: - -1. Lock the page to be migrated. - -2. Ensure that writeback is complete. - -3. Lock the new page that we want to move to. It is locked so that accesses to - this (not yet up-to-date) page immediately block while the move is in progress. - -4. All the page table references to the page are converted to migration - entries. This decreases the mapcount of a page. If the resulting - mapcount is not zero then we do not migrate the page. All user space - processes that attempt to access the page will now wait on the page lock - or wait for the migration page table entry to be removed. - -5. The i_pages lock is taken. This will cause all processes trying - to access the page via the mapping to block on the spinlock. - -6. The refcount of the page is examined and we back out if references remain. - Otherwise, we know that we are the only one referencing this page. - -7. The radix tree is checked and if it does not contain the pointer to this - page then we back out because someone else modified the radix tree. - -8. The new page is prepped with some settings from the old page so that - accesses to the new page will discover a page with the correct settings. - -9. The radix tree is changed to point to the new page. - -10. The reference count of the old page is dropped because the address space - reference is gone. A reference to the new page is established because - the new page is referenced by the address space. - -11. The i_pages lock is dropped. With that lookups in the mapping - become possible again. Processes will move from spinning on the lock - to sleeping on the locked new page. - -12. The page contents are copied to the new page. - -13. The remaining page flags are copied to the new page. - -14. The old page flags are cleared to indicate that the page does - not provide any information anymore. - -15. Queued up writeback on the new page is triggered. - -16. If migration entries were inserted into the page table, then replace them - with real ptes. Doing so will enable access for user space processes not - already waiting for the page lock. - -17. The page locks are dropped from the old and new page. - Processes waiting on the page lock will redo their page faults - and will reach the new page. - -18. The new page is moved to the LRU and can be scanned by the swapper, - etc. again. - -Non-LRU page migration -====================== - -Although migration originally aimed for reducing the latency of memory accesses -for NUMA, compaction also uses migration to create high-order pages. - -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. However, there are potential non-LRU pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add their own specific hooks in the migration path. - -To overcome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -in the migration path. - -If a driver wants to make its pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects from isolate_page() function of driver is to return *true* - if driver isolates the page successfully. On returning true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in those fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage() of driver with the isolated page. - The function of migratepage() is to move the contents of the old page to the - new page - and set up fields of struct page newpage. Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returned - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporary migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without - retrying. - - Driver shouldn't touch the page.lru field while in the migratepage() function. - -3. ``void (*putback_page)(struct page *);`` - - If migration fails on the isolated page, VM should return the isolated page - to the driver so VM calls the driver's putback_page() with the isolated page. - In this function, the driver should put the isolated page back into its own data - structure. - -Non-LRU movable page flags - - There are two page flags for supporting non-LRU movable page. - - * PG_movable - - Driver should use the function below to make page movable under page_lock:: - - void __SetPageMovable(struct page *page, struct address_space *mapping) - - It needs argument of address_space for registering migration - family functions which will be called by VM. Exactly speaking, - PG_movable is not a real flag of struct page. Rather, VM - reuses the page->mapping's lower bits to represent it:: - - #define PAGE_MAPPING_MOVABLE 0x2 - page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; - - so driver shouldn't access page->mapping directly. Instead, driver should - use page_mapping() which masks off the low two bits of page->mapping under - page lock so it can get the right struct address_space. - - For testing of non-LRU movable pages, VM supports __PageMovable() function. - However, it doesn't guarantee to identify non-LRU movable pages because - the page->mapping field is unified with other variables in struct page. - If the driver releases the page after isolation by VM, page->mapping - doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set - (look at __ClearPageMovable). But __PageMovable() is cheap to call whether - page is LRU or non-LRU movable once the page has been isolated because LRU - pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also - good for just peeking to test non-LRU movable pages before more expensive - checking with lock_page() in pfn scanning to select a victim. - - For guaranteeing non-LRU movable page, VM provides PageMovable() function. - Unlike __PageMovable(), PageMovable() validates page->mapping and - mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents - sudden destroying of page->mapping. - - Drivers using __SetPageMovable() should clear the flag via - __ClearMovablePage() under page_lock() before the releasing the page. - - * PG_isolated - - To prevent concurrent isolation among several CPUs, VM marks isolated page - as PG_isolated under lock_page(). So if a CPU encounters PG_isolated - non-LRU movable page, it can skip it. Driver doesn't need to manipulate the - flag because VM will set/clear it automatically. Keep in mind that if the - driver sees a PG_isolated page, it means the page has been isolated by the - VM so it shouldn't touch the page.lru field. - The PG_isolated flag is aliased with the PG_reclaim flag so drivers - shouldn't use PG_isolated for its own purposes. - -Monitoring Migration -===================== - -The following events (counters) can be used to monitor page migration. - -1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a - page was migrated. If the page was a non-THP and non-hugetlb page, then - this counter is increased by one. If the page was a THP or hugetlb, then - this counter is increased by the number of THP or hugetlb subpages. - For example, migration of a single 2MB THP that has 4KB-size base pages - (subpages) will cause this counter to increase by 512. - -2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for - PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, - if it was a THP or hugetlb. - -3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. - -4. THP_MIGRATION_FAIL: A THP could not be migrated nor it could be split. - -5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had - to be split. After splitting, a migration retry was used for it's sub-pages. - -THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or -PGMIGRATE_FAIL events. For example, a THP migration failure will cause both -THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase. - -Christoph Lameter, May 8, 2006. -Minchan Kim, Mar 28, 2016. diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst deleted file mode 100644 index f5c954afe97c..000000000000 --- a/Documentation/vm/page_owner.rst +++ /dev/null @@ -1,196 +0,0 @@ -.. _page_owner: - -================================================== -page owner: Tracking about who allocated each page -================================================== - -Introduction -============ - -page owner is for the tracking about who allocated each page. -It can be used to debug memory leak or to find a memory hogger. -When allocation happens, information about allocation such as call stack -and order of pages is stored into certain storage for each page. -When we need to know about status of all pages, we can get and analyze -this information. - -Although we already have tracepoint for tracing page allocation/free, -using it for analyzing who allocate each page is rather complex. We need -to enlarge the trace buffer for preventing overlapping until userspace -program launched. And, launched program continually dump out the trace -buffer for later analysis and it would change system behaviour with more -possibility rather than just keeping it in memory, so bad for debugging. - -page owner can also be used for various purposes. For example, accurate -fragmentation statistics can be obtained through gfp flag information of -each page. It is already implemented and activated if page owner is -enabled. Other usages are more than welcome. - -page owner is disabled by default. So, if you'd like to use it, you need -to add "page_owner=on" to your boot cmdline. If the kernel is built -with page owner and page owner is disabled in runtime due to not enabling -boot option, runtime overhead is marginal. If disabled in runtime, it -doesn't require memory to store owner information, so there is no runtime -memory overhead. And, page owner inserts just two unlikely branches into -the page allocator hotpath and if not enabled, then allocation is done -like as the kernel without page owner. These two unlikely branches should -not affect to allocation performance, especially if the static keys jump -label patching functionality is available. Following is the kernel's code -size change due to this facility. - -- Without page owner:: - - text data bss dec hex filename - 48392 2333 644 51369 c8a9 mm/page_alloc.o - -- With page owner:: - - text data bss dec hex filename - 48800 2445 644 51889 cab1 mm/page_alloc.o - 6662 108 29 6799 1a8f mm/page_owner.o - 1025 8 8 1041 411 mm/page_ext.o - -Although, roughly, 8 KB code is added in total, page_alloc.o increase by -520 bytes and less than half of it is in hotpath. Building the kernel with -page owner and turning it on if needed would be great option to debug -kernel memory problem. - -There is one notice that is caused by implementation detail. page owner -stores information into the memory from struct page extension. This memory -is initialized some time later than that page allocator starts in sparse -memory system, so, until initialization, many pages can be allocated and -they would have no owner information. To fix it up, these early allocated -pages are investigated and marked as allocated in initialization phase. -Although it doesn't mean that they have the right owner information, -at least, we can tell whether the page is allocated or not, -more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages -are catched and marked, although they are mostly allocated from struct -page extension feature. Anyway, after that, no page is left in -un-tracking state. - -Usage -===== - -1) Build user-space helper:: - - cd tools/vm - make page_owner_sort - -2) Enable page owner: add "page_owner=on" to boot cmdline. - -3) Do the job that you want to debug. - -4) Analyze information from page owner:: - - cat /sys/kernel/debug/page_owner > page_owner_full.txt - ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - - The general output of ``page_owner_full.txt`` is as follows:: - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows - in buf, uses regexp to extract the page order value, counts the times - and pages of buf, and finally sorts them according to the parameter(s). - - See the result about who allocated each page - in the ``sorted_page_owner.txt``. General output:: - - XXX times, XXX pages: - Page allocated via order XXX, ... - // Detailed stack - - By default, ``page_owner_sort`` is sorted according to the times of buf. - If you want to sort by the page nums of buf, use the ``-m`` parameter. - The detailed parameters are: - - fundamental function:: - - Sort: - -a Sort by memory allocation time. - -m Sort by total memory. - -p Sort by pid. - -P Sort by tgid. - -n Sort by task command name. - -r Sort by memory release time. - -s Sort by stack trace. - -t Sort by times (default). - --sort Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]]. - Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is - optional since default direction is increasing numerical or lexicographic - order. Mixed use of abbreviated and complete-form of keys is allowed. - - Examples: - ./page_owner_sort --sort=n,+pid,-tgid - ./page_owner_sort --sort=at - - additional function:: - - Cull: - --cull - Specify culling rules.Culling syntax is key[,key[,...]].Choose a - multi-letter key from the **STANDARD FORMAT SPECIFIERS** section. - - is a single argument in the form of a comma-separated list, - which offers a way to specify individual culling rules. The recognized - keywords are described in the **STANDARD FORMAT SPECIFIERS** section below. - can be specified by the sequence of keys k1,k2, ..., as described in - the STANDARD SORT KEYS section below. Mixed use of abbreviated and - complete-form of keys is allowed. - - Examples: - ./page_owner_sort --cull=stacktrace - ./page_owner_sort --cull=st,pid,name - ./page_owner_sort --cull=n,f - - Filter: - -f Filter out the information of blocks whose memory has been released. - - Select: - --pid Select by pid. This selects the blocks whose process ID - numbers appear in . - --tgid Select by tgid. This selects the blocks whose thread - group ID numbers appear in . - --name Select by task command name. This selects the blocks whose - task command name appear in . - - , , are single arguments in the form of a comma-separated list, - which offers a way to specify individual selecting rules. - - - Examples: - ./page_owner_sort --pid=1 - ./page_owner_sort --tgid=1,2,3 - ./page_owner_sort --name name1,name2 - -STANDARD FORMAT SPECIFIERS -========================== -:: - - For --sort option: - - KEY LONG DESCRIPTION - p pid process ID - tg tgid thread group ID - n name task command name - st stacktrace stack trace of the page allocation - T txt full text of block - ft free_ts timestamp of the page when it was released - at alloc_ts timestamp of the page when it was allocated - ator allocator memory allocator for pages - - For --curl option: - - KEY LONG DESCRIPTION - p pid process ID - tg tgid thread group ID - n name task command name - f free whether the page has been released or not - st stacktrace stack trace of the page allocation - ator allocator memory allocator for pages diff --git a/Documentation/vm/page_reclaim.rst b/Documentation/vm/page_reclaim.rst deleted file mode 100644 index 50a30b7f8ac3..000000000000 --- a/Documentation/vm/page_reclaim.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============ -Page Reclaim -============ diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst deleted file mode 100644 index 1a09472f10a3..000000000000 --- a/Documentation/vm/page_table_check.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _page_table_check: - -================ -Page Table Check -================ - -Introduction -============ - -Page table check allows to harden the kernel by ensuring that some types of -the memory corruptions are prevented. - -Page table check performs extra verifications at the time when new pages become -accessible from the userspace by getting their page table entries (PTEs PMDs -etc.) added into the table. - -In case of detected corruption, the kernel is crashed. There is a small -performance and memory overhead associated with the page table check. Therefore, -it is disabled by default, but can be optionally enabled on systems where the -extra hardening outweighs the performance costs. Also, because page table check -is synchronous, it can help with debugging double map memory corruption issues, -by crashing kernel at the time wrong mapping occurs instead of later which is -often the case with memory corruptions bugs. - -Double mapping detection logic -============================== - -+-------------------+-------------------+-------------------+------------------+ -| Current Mapping | New mapping | Permissions | Rule | -+===================+===================+===================+==================+ -| Anonymous | Anonymous | Read | Allow | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Anonymous | Read / Write | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Named | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Anonymous | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Named | Any | Allow | -+-------------------+-------------------+-------------------+------------------+ - -Enabling Page Table Check -========================= - -Build kernel with: - -- PAGE_TABLE_CHECK=y - Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK - is available. - -- Boot with 'page_table_check=on' kernel parameter. - -Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page -table support without extra kernel parameter. diff --git a/Documentation/vm/page_tables.rst b/Documentation/vm/page_tables.rst deleted file mode 100644 index 96939571d7bc..000000000000 --- a/Documentation/vm/page_tables.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=========== -Page Tables -=========== diff --git a/Documentation/vm/physical_memory.rst b/Documentation/vm/physical_memory.rst deleted file mode 100644 index 2ab7b8c1c863..000000000000 --- a/Documentation/vm/physical_memory.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Physical Memory -=============== diff --git a/Documentation/vm/process_addrs.rst b/Documentation/vm/process_addrs.rst deleted file mode 100644 index e8618fbc62c9..000000000000 --- a/Documentation/vm/process_addrs.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================= -Process Addresses -================= diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/vm/remap_file_pages.rst deleted file mode 100644 index 7bef6718e3a9..000000000000 --- a/Documentation/vm/remap_file_pages.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _remap_file_pages: - -============================== -remap_file_pages() system call -============================== - -The remap_file_pages() system call is used to create a nonlinear mapping, -that is, a mapping in which the pages of the file are mapped into a -nonsequential order in memory. The advantage of using remap_file_pages() -over using repeated calls to mmap(2) is that the former approach does not -require the kernel to create additional VMA (Virtual Memory Area) data -structures. - -Supporting of nonlinear mapping requires significant amount of non-trivial -code in kernel virtual memory subsystem including hot paths. Also to get -nonlinear mapping work kernel need a way to distinguish normal page table -entries from entries with file offset (pte_file). Kernel reserves flag in -PTE for this purpose. PTE flags are scarce resource especially on some CPU -architectures. It would be nice to free up the flag for other usage. - -Fortunately, there are not many users of remap_file_pages() in the wild. -It's only known that one enterprise RDBMS implementation uses the syscall -on 32-bit systems to map files bigger than can linearly fit into 32-bit -virtual address space. This use-case is not critical anymore since 64-bit -systems are widely available. - -The syscall is deprecated and replaced it with an emulation now. The -emulation creates new VMAs instead of nonlinear mappings. It's going to -work slower for rare users of remap_file_pages() but ABI is preserved. - -One side effect of emulation (apart from performance) is that user can hit -vm.max_map_count limit more easily due to additional VMAs. See comment for -DEFAULT_MAX_MAP_COUNT for more details on the limit. diff --git a/Documentation/vm/shmfs.rst b/Documentation/vm/shmfs.rst deleted file mode 100644 index 8b01ebb4c30e..000000000000 --- a/Documentation/vm/shmfs.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -======================== -Shared Memory Filesystem -======================== diff --git a/Documentation/vm/slab.rst b/Documentation/vm/slab.rst deleted file mode 100644 index 87d5a5bb172f..000000000000 --- a/Documentation/vm/slab.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Slab Allocation -=============== diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst deleted file mode 100644 index 43063ade737a..000000000000 --- a/Documentation/vm/slub.rst +++ /dev/null @@ -1,452 +0,0 @@ -.. _slub: - -========================== -Short users guide for SLUB -========================== - -The basic philosophy of SLUB is very different from SLAB. SLAB -requires rebuilding the kernel to activate debug options for all -slab caches. SLUB always includes full debugging but it is off by default. -SLUB can enable debugging only for selected slabs in order to avoid -an impact on overall system performance which may make a bug more -difficult to find. - -In order to switch debugging on one can add an option ``slub_debug`` -to the kernel command line. That will enable full debugging for -all slabs. - -Typically one would then use the ``slabinfo`` command to get statistical -data and perform operation on the slabs. By default ``slabinfo`` only lists -slabs that have data in them. See "slabinfo -h" for more options when -running the command. ``slabinfo`` can be compiled with -:: - - gcc -o slabinfo tools/vm/slabinfo.c - -Some of the modes of operation of ``slabinfo`` require that slub debugging -be enabled on the command line. F.e. no tracking information will be -available without debugging on and validation can only partially -be performed if debugging was not switched on. - -Some more sophisticated uses of slub_debug: -------------------------------------------- - -Parameters may be given to ``slub_debug``. If none is specified then full -debugging is enabled. Format: - -slub_debug= - Enable options for all slabs - -slub_debug=,,,... - Enable options only for select slabs (no spaces - after a comma) - -Multiple blocks of options for all slabs or selected slabs can be given, with -blocks of options delimited by ';'. The last of "all slabs" blocks is applied -to all slabs except those that match one of the "select slabs" block. Options -of the first "select slabs" blocks that matches the slab's name are applied. - -Possible debug options are:: - - F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS - Sorry SLAB legacy issues) - Z Red zoning - P Poisoning (object and padding) - U User tracking (free and alloc) - T Trace (please only use on single slabs) - A Enable failslab filter mark for the cache - O Switch debugging off for caches that would have - caused higher minimum slab orders - - Switch all debugging off (useful if the kernel is - configured with CONFIG_SLUB_DEBUG_ON) - -F.e. in order to boot just with sanity checks and red zoning one would specify:: - - slub_debug=FZ - -Trying to find an issue in the dentry cache? Try:: - - slub_debug=,dentry - -to only enable debugging on the dentry cache. You may use an asterisk at the -end of the slab name, in order to cover all slabs with the same prefix. For -example, here's how you can poison the dentry cache as well as all kmalloc -slabs:: - - slub_debug=P,kmalloc-*,dentry - -Red zoning and tracking may realign the slab. We can just apply sanity checks -to the dentry cache with:: - - slub_debug=F,dentry - -Debugging options may require the minimum possible slab order to increase as -a result of storing the metadata (for example, caches with PAGE_SIZE object -sizes). This has a higher liklihood of resulting in slab allocation errors -in low memory situations or if there's high fragmentation of memory. To -switch off debugging for such caches by default, use:: - - slub_debug=O - -You can apply different options to different list of slab names, using blocks -of options. This will enable red zoning for dentry and user tracking for -kmalloc. All other slabs will not get any debugging enabled:: - - slub_debug=Z,dentry;U,kmalloc-* - -You can also enable options (e.g. sanity checks and poisoning) for all caches -except some that are deemed too performance critical and don't need to be -debugged by specifying global debug options followed by a list of slab names -with "-" as options:: - - slub_debug=FZ;-,zs_handle,zspage - -The state of each debug option for a slab can be found in the respective files -under:: - - /sys/kernel/slab// - -If the file contains 1, the option is enabled, 0 means disabled. The debug -options from the ``slub_debug`` parameter translate to the following files:: - - F sanity_checks - Z red_zone - P poison - U store_user - T trace - A failslab - -Careful with tracing: It may spew out lots of information and never stop if -used on the wrong slab. - -Slab merging -============ - -If no debug options are specified then SLUB may merge similar slabs together -in order to reduce overhead and increase cache hotness of objects. -``slabinfo -a`` displays which slabs were merged together. - -Slab validation -=============== - -SLUB can validate all object if the kernel was booted with slub_debug. In -order to do so you must have the ``slabinfo`` tool. Then you can do -:: - - slabinfo -v - -which will test all objects. Output will be generated to the syslog. - -This also works in a more limited way if boot was without slab debug. -In that case ``slabinfo -v`` simply tests all reachable objects. Usually -these are in the cpu slabs and the partial slabs. Full slabs are not -tracked by SLUB in a non debug situation. - -Getting more performance -======================== - -To some degree SLUB's performance is limited by the need to take the -list_lock once in a while to deal with partial slabs. That overhead is -governed by the order of the allocation for each slab. The allocations -can be influenced by kernel parameters: - -.. slub_min_objects=x (default 4) -.. slub_min_order=x (default 0) -.. slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER)) - -``slub_min_objects`` - allows to specify how many objects must at least fit into one - slab in order for the allocation order to be acceptable. In - general slub will be able to perform this number of - allocations on a slab without consulting centralized resources - (list_lock) where contention may occur. - -``slub_min_order`` - specifies a minimum order of slabs. A similar effect like - ``slub_min_objects``. - -``slub_max_order`` - specified the order at which ``slub_min_objects`` should no - longer be checked. This is useful to avoid SLUB trying to - generate super large order pages to fit ``slub_min_objects`` - of a slab cache with large object sizes into one high order - page. Setting command line parameter - ``debug_guardpage_minorder=N`` (N > 0), forces setting - ``slub_max_order`` to 0, what cause minimum possible order of - slabs allocation. - -SLUB Debug output -================= - -Here is a sample of slub debug output:: - - ==================================================================== - BUG kmalloc-8: Right Redzone overwritten - -------------------------------------------------------------------- - - INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc - INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58 - INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 - INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 - - Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ - Object (0xc90f6d20): 31 30 31 39 2e 30 30 35 1019.005 - Redzone (0xc90f6d28): 00 cc cc cc . - Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ - - [] dump_trace+0x63/0x1eb - [] show_trace_log_lvl+0x1a/0x2f - [] show_trace+0x12/0x14 - [] dump_stack+0x16/0x18 - [] object_err+0x143/0x14b - [] check_object+0x66/0x234 - [] __slab_free+0x239/0x384 - [] kfree+0xa6/0xc6 - [] get_modalias+0xb9/0xf5 - [] dmi_dev_uevent+0x27/0x3c - [] dev_uevent+0x1ad/0x1da - [] kobject_uevent_env+0x20a/0x45b - [] kobject_uevent+0xa/0xf - [] store_uevent+0x4f/0x58 - [] dev_attr_store+0x29/0x2f - [] sysfs_write_file+0x16e/0x19c - [] vfs_write+0xd1/0x15a - [] sys_write+0x3d/0x72 - [] sysenter_past_esp+0x5f/0x99 - [] 0xb7f7b410 - ======================= - - FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc - -If SLUB encounters a corrupted object (full detection requires the kernel -to be booted with slub_debug) then the following output will be dumped -into the syslog: - -1. Description of the problem encountered - - This will be a message in the system log starting with:: - - =============================================== - BUG : - ----------------------------------------------- - - INFO: - - INFO: Slab
- INFO: Object
- INFO: Allocated in age= cpu= pid= - INFO: Freed in age= cpu= - pid= - - (Object allocation / free information is only available if SLAB_STORE_USER is - set for the slab. slub_debug sets that option) - -2. The object contents if an object was involved. - - Various types of lines can follow the BUG SLUB line: - - Bytes b4
: - Shows a few bytes before the object where the problem was detected. - Can be useful if the corruption does not stop with the start of the - object. - - Object
: - The bytes of the object. If the object is inactive then the bytes - typically contain poison values. Any non-poison value shows a - corruption by a write after free. - - Redzone
: - The Redzone following the object. The Redzone is used to detect - writes after the object. All bytes should always have the same - value. If there is any deviation then it is due to a write after - the object boundary. - - (Redzone information is only available if SLAB_RED_ZONE is set. - slub_debug sets that option) - - Padding
: - Unused data to fill up the space in order to get the next object - properly aligned. In the debug case we make sure that there are - at least 4 bytes of padding. This allows the detection of writes - before the object. - -3. A stackdump - - The stackdump describes the location where the error was detected. The cause - of the corruption is may be more likely found by looking at the function that - allocated or freed the object. - -4. Report on how the problem was dealt with in order to ensure the continued - operation of the system. - - These are messages in the system log beginning with:: - - FIX : - - In the above sample SLUB found that the Redzone of an active object has - been overwritten. Here a string of 8 characters was written into a slab that - has the length of 8 characters. However, a 8 character string needs a - terminating 0. That zero has overwritten the first byte of the Redzone field. - After reporting the details of the issue encountered the FIX SLUB message - tells us that SLUB has restored the Redzone to its proper value and then - system operations continue. - -Emergency operations -==================== - -Minimal debugging (sanity checks alone) can be enabled by booting with:: - - slub_debug=F - -This will be generally be enough to enable the resiliency features of slub -which will keep the system running even if a bad kernel component will -keep corrupting objects. This may be important for production systems. -Performance will be impacted by the sanity checks and there will be a -continual stream of error messages to the syslog but no additional memory -will be used (unlike full debugging). - -No guarantees. The kernel component still needs to be fixed. Performance -may be optimized further by locating the slab that experiences corruption -and enabling debugging only for that cache - -I.e.:: - - slub_debug=F,dentry - -If the corruption occurs by writing after the end of the object then it -may be advisable to enable a Redzone to avoid corrupting the beginning -of other objects:: - - slub_debug=FZ,dentry - -Extended slabinfo mode and plotting -=================================== - -The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes: - - Slabcache Totals - - Slabs sorted by size (up to -N slabs, default 1) - - Slabs sorted by loss (up to -N slabs, default 1) - -Additionally, in this mode ``slabinfo`` does not dynamically scale -sizes (G/M/K) and reports everything in bytes (this functionality is -also available to other slabinfo modes via '-B' option) which makes -reporting more precise and accurate. Moreover, in some sense the `-X' -mode also simplifies the analysis of slabs' behaviour, because its -output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it -pushes the analysis from looking through the numbers (tons of numbers) -to something easier -- visual analysis. - -To generate plots: - -a) collect slabinfo extended records, for example:: - - while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done - -b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script:: - - slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN] - - The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records - and generates 3 png files (and 3 pre-processing cache files) per STATS - file: - - Slabcache Totals: FOO_STATS-totals.png - - Slabs sorted by size: FOO_STATS-slabs-by-size.png - - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png - -Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you -need to compare slabs' behaviour "prior to" and "after" some code -modification. To help you out there, ``slabinfo-gnuplot.sh`` script -can 'merge' the `Slabcache Totals` sections from different -measurements. To visually compare N plots: - -a) Collect as many STATS1, STATS2, .. STATSN files as you need:: - - while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done - -b) Pre-process those STATS files:: - - slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN - -c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the - generated pre-processed \*-totals:: - - slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals - - This will produce a single plot (png file). - - Plots, expectedly, can be large so some fluctuations or small spikes - can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two - options to 'zoom-in'/'zoom-out': - - a) ``-s %d,%d`` -- overwrites the default image width and height - b) ``-r %d,%d`` -- specifies a range of samples to use (for example, - in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r - 40,60`` range will plot only samples collected between 40th and - 60th seconds). - - -DebugFS files for SLUB -====================== - -For more information about current state of SLUB caches with the user tracking -debug option enabled, debugfs files are available, typically under -/sys/kernel/debug/slab// (created only for caches with enabled user -tracking). There are 2 types of these files with the following debug -information: - -1. alloc_traces:: - - Prints information about unique allocation traces of the currently - allocated objects. The output is sorted by frequency of each trace. - - Information in the output: - Number of objects, allocating function, minimal/average/maximal jiffies since alloc, - pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. - - Example::: - - 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: - __slab_alloc+0x6d/0x90 - kmem_cache_alloc_trace+0x2eb/0x300 - populate_error_injection_list+0x97/0x110 - init_error_injection+0x1b/0x71 - do_one_initcall+0x5f/0x2d0 - kernel_init_freeable+0x26f/0x2d7 - kernel_init+0xe/0x118 - ret_from_fork+0x22/0x30 - - -2. free_traces:: - - Prints information about unique freeing traces of the currently allocated - objects. The freeing traces thus come from the previous life-cycle of the - objects and are reported as not available for objects allocated for the first - time. The output is sorted by frequency of each trace. - - Information in the output: - Number of objects, freeing function, minimal/average/maximal jiffies since free, - pid range of the freeing processes, cpu mask of freeing cpus, and stack trace. - - Example::: - - 1980 age=4294912290 pid=0 cpus=0 - 51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1 - kfree+0x2db/0x420 - acpi_ut_update_ref_count+0x6a6/0x782 - acpi_ut_update_object_reference+0x1ad/0x234 - acpi_ut_remove_reference+0x7d/0x84 - acpi_rs_get_prt_method_data+0x97/0xd6 - acpi_get_irq_routing_table+0x82/0xc4 - acpi_pci_irq_find_prt_entry+0x8e/0x2e0 - acpi_pci_irq_lookup+0x3a/0x1e0 - acpi_pci_irq_enable+0x77/0x240 - pcibios_enable_device+0x39/0x40 - do_pci_enable_device.part.0+0x5d/0xe0 - pci_enable_device_flags+0xfc/0x120 - pci_enable_device+0x13/0x20 - virtio_pci_probe+0x9e/0x170 - local_pci_probe+0x48/0x80 - pci_device_probe+0x105/0x1c0 - -Christoph Lameter, May 30, 2007 -Sergey Senozhatsky, October 23, 2015 diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst deleted file mode 100644 index c08919662704..000000000000 --- a/Documentation/vm/split_page_table_lock.rst +++ /dev/null @@ -1,100 +0,0 @@ -.. _split_page_table_lock: - -===================== -Split page table lock -===================== - -Originally, mm->page_table_lock spinlock protected all page tables of the -mm_struct. But this approach leads to poor page fault scalability of -multi-threaded applications due high contention on the lock. To improve -scalability, split page table lock was introduced. - -With split page table lock we have separate per-table lock to serialize -access to the table. At the moment we use split lock for PTE and PMD -tables. Access to higher level tables protected by mm->page_table_lock. - -There are helpers to lock/unlock a table and other accessor functions: - - - pte_offset_map_lock() - maps pte and takes PTE table lock, returns pointer to the taken - lock; - - pte_unmap_unlock() - unlocks and unmaps PTE table; - - pte_alloc_map_lock() - allocates PTE table if needed and take the lock, returns pointer - to taken lock or NULL if allocation failed; - - pte_lockptr() - returns pointer to PTE table lock; - - pmd_lock() - takes PMD table lock, returns pointer to taken lock; - - pmd_lockptr() - returns pointer to PMD table lock; - -Split page table lock for PTE tables is enabled compile-time if -CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS. -If split lock is disabled, all tables are guarded by mm->page_table_lock. - -Split page table lock for PMD tables is enabled, if it's enabled for PTE -tables and the architecture supports it (see below). - -Hugetlb and split page table lock -================================= - -Hugetlb can support several page sizes. We use split lock only for PMD -level, but not for PUD. - -Hugetlb-specific helpers: - - - huge_pte_lock() - takes pmd split lock for PMD_SIZE page, mm->page_table_lock - otherwise; - - huge_pte_lockptr() - returns pointer to table lock; - -Support of split page table lock by an architecture -=================================================== - -There's no need in special enabling of PTE split page table lock: everything -required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which -must be called on PTE table allocation / freeing. - -Make sure the architecture doesn't use slab allocator for page table -allocation: slab uses page->slab_cache for its pages. -This field shares storage with page->ptl. - -PMD split lock only makes sense if you have more than two page table -levels. - -PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table -allocation and pgtable_pmd_page_dtor() on freeing. - -Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and -pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing -paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). - -With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. - -NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must -be handled properly. - -page->ptl -========= - -page->ptl is used to access split page table lock, where 'page' is struct -page of page containing the table. It shares storage with page->private -(and few other fields in union). - -To avoid increasing size of struct page and have best performance, we use a -trick: - - - if spinlock_t fits into long, we use page->ptr as spinlock, so we - can avoid indirect access and save a cache line. - - if size of spinlock_t is bigger then size of long, we use page->ptl as - pointer to spinlock_t and allocate it dynamically. This allows to use - split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs - one more cache line for indirect access; - -The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in -pgtable_pmd_page_ctor() for PMD table. - -Please, never access page->ptl directly -- use appropriate helper. diff --git a/Documentation/vm/swap.rst b/Documentation/vm/swap.rst deleted file mode 100644 index 78819bd4d745..000000000000 --- a/Documentation/vm/swap.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -==== -Swap -==== diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst deleted file mode 100644 index 216db1d67d04..000000000000 --- a/Documentation/vm/transhuge.rst +++ /dev/null @@ -1,187 +0,0 @@ -.. _transhuge: - -============================ -Transparent Hugepage Support -============================ - -This document describes design principles for Transparent Hugepage (THP) -support and its interaction with other parts of the memory management -system. - -Design principles -================= - -- "graceful fallback": mm components which don't have transparent hugepage - knowledge fall back to breaking huge pmd mapping into table of ptes and, - if necessary, split a transparent hugepage. Therefore these components - can continue working on the regular pages or regular pte mappings. - -- if a hugepage allocation fails because of memory fragmentation, - regular pages should be gracefully allocated instead and mixed in - the same vma without any failure or significant delay and without - userland noticing - -- if some task quits and more hugepages become available (either - immediately in the buddy or through the VM), guest physical memory - backed by regular pages should be relocated on hugepages - automatically (with khugepaged) - -- it doesn't require memory reservation and in turn it uses hugepages - whenever possible (the only possible reservation here is kernelcore= - to avoid unmovable pages to fragment all the memory but such a tweak - is not specific to transparent hugepage support and it's a generic - feature that applies to all dynamic high order allocations in the - kernel) - -get_user_pages and follow_page -============================== - -get_user_pages and follow_page if run on a hugepage, will return the -head or tail pages as usual (exactly as they would do on -hugetlbfs). Most GUP users will only care about the actual physical -address of the page and its temporary pinning to release after the I/O -is complete, so they won't ever notice the fact the page is huge. But -if any driver is going to mangle over the page structure of the tail -page (like for checking page->mapping or other bits that are relevant -for the head page and not the tail page), it should be updated to jump -to check head page instead. Taking a reference on any head/tail page would -prevent the page from being split by anyone. - -.. note:: - these aren't new constraints to the GUP API, and they match the - same constraints that apply to hugetlbfs too, so any driver capable - of handling GUP on hugetlbfs will also work fine on transparent - hugepage backed mappings. - -Graceful fallback -================= - -Code walking pagetables but unaware about huge pmds can simply call -split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by -pmd_offset. It's trivial to make the code transparent hugepage aware -by just grepping for "pmd_offset" and adding split_huge_pmd where -missing after pmd_offset returns the pmd. Thanks to the graceful -fallback design, with a one liner change, you can avoid to write -hundreds if not thousands of lines of complex code to make your code -hugepage aware. - -If you're not walking pagetables but you run into a physical hugepage -that you can't handle natively in your code, you can split it by -calling split_huge_page(page). This is what the Linux VM does before -it tries to swapout the hugepage for example. split_huge_page() can fail -if the page is pinned and you must handle this correctly. - -Example to make mremap.c transparent hugepage aware with a one liner -change:: - - diff --git a/mm/mremap.c b/mm/mremap.c - --- a/mm/mremap.c - +++ b/mm/mremap.c - @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru - return NULL; - - pmd = pmd_offset(pud, addr); - + split_huge_pmd(vma, pmd, addr); - if (pmd_none_or_clear_bad(pmd)) - return NULL; - -Locking in hugepage aware code -============================== - -We want as much code as possible hugepage aware, as calling -split_huge_page() or split_huge_pmd() has a cost. - -To make pagetable walks huge pmd aware, all you need to do is to call -pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the -mmap_lock in read (or write) mode to be sure a huge pmd cannot be -created from under you by khugepaged (khugepaged collapse_huge_page -takes the mmap_lock in write mode in addition to the anon_vma lock). If -pmd_trans_huge returns false, you just fallback in the old code -paths. If instead pmd_trans_huge returns true, you have to take the -page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the -page table lock will prevent the huge pmd being converted into a -regular pmd from under you (split_huge_pmd can run in parallel to the -pagetable walk). If the second pmd_trans_huge returns false, you -should just drop the page table lock and fallback to the old code as -before. Otherwise, you can proceed to process the huge pmd and the -hugepage natively. Once finished, you can drop the page table lock. - -Refcounts and transparent huge pages -==================================== - -Refcounting on THP is mostly consistent with refcounting on other compound -pages: - - - get_page()/put_page() and GUP operate on head page's ->_refcount. - - - ->_refcount in tail pages is always zero: get_page_unless_zero() never - succeeds on tail pages. - - - map/unmap of the pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page. - - - map/unmap of the whole compound page is accounted for in compound_mapcount - (stored in first tail page). For file huge pages, we also increment - ->_mapcount of all sub-pages in order to have race-free detection of - last unmap of subpages. - -PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. - -For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all -subpages is offset up by one. This additional reference is required to -get race-free detection of unmap of subpages when we have them mapped with -both PMDs and PTEs. - -This optimization is required to lower the overhead of per-subpage mapcount -tracking. The alternative is to alter ->_mapcount in all subpages on each -map/unmap of the whole compound page. - -For anonymous pages, we set PG_double_map when a PMD of the page is split -for the first time, but still have a PMD mapping. The additional references -go away with the last compound_mapcount. - -File pages get PG_double_map set on the first map of the page with PTE and -goes away when the page gets evicted from the page cache. - -split_huge_page internally has to distribute the refcounts in the head -page to the tail pages before clearing all PG_head/tail bits from the page -structures. It can be done easily for refcounts taken by page table -entries, but we don't have enough information on how to distribute any -additional pins (i.e. from get_user_pages). split_huge_page() fails any -requests to split pinned huge pages: it expects page count to be equal to -the sum of mapcount of all sub-pages plus one (split_huge_page caller must -have a reference to the head page). - -split_huge_page uses migration entries to stabilize page->_refcount and -page->_mapcount of anonymous pages. File pages just get unmapped. - -We are safe against physical memory scanners too: the only legitimate way -a scanner can get a reference to a page is get_page_unless_zero(). - -All tail pages have zero ->_refcount until atomic_add(). This prevents the -scanner from getting a reference to the tail page up to that point. After the -atomic_add() we don't care about the ->_refcount value. We already know how -many references should be uncharged from the head page. - -For head page get_page_unless_zero() will succeed and we don't mind. It's -clear where references should go after split: it will stay on the head page. - -Note that split_huge_pmd() doesn't have any limitations on refcounting: -pmd can be split at any point and never fails. - -Partial unmap and deferred_split_huge_page() -============================================ - -Unmapping part of THP (with munmap() or other way) is not going to free -memory immediately. Instead, we detect that a subpage of THP is not in use -in page_remove_rmap() and queue the THP for splitting if memory pressure -comes. Splitting will free up unused subpages. - -Splitting the page right away is not an option due to locking context in -the place where we can detect partial unmap. It also might be -counterproductive since in many cases partial unmap happens during exit(2) if -a THP crosses a VMA boundary. - -The function deferred_split_huge_page() is used to queue a page for splitting. -The splitting itself will happen when we get memory pressure via shrinker -interface. diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst deleted file mode 100644 index b280367d6a44..000000000000 --- a/Documentation/vm/unevictable-lru.rst +++ /dev/null @@ -1,554 +0,0 @@ -.. _unevictable_lru: - -============================== -Unevictable LRU Infrastructure -============================== - -.. contents:: :local: - - -Introduction -============ - -This document describes the Linux memory manager's "Unevictable LRU" -infrastructure and the use of this to manage several types of "unevictable" -pages. - -The document attempts to provide the overall rationale behind this mechanism -and the rationale for some of the design decisions that drove the -implementation. The latter design rationale is discussed in the context of an -implementation description. Admittedly, one can obtain the implementation -details - the "what does it do?" - by reading the code. One hopes that the -descriptions below add value by provide the answer to "why does it do that?". - - - -The Unevictable LRU -=================== - -The Unevictable LRU facility adds an additional LRU list to track unevictable -pages and to hide these pages from vmscan. This mechanism is based on a patch -by Larry Woodman of Red Hat to address several scalability problems with page -reclaim in Linux. The problems have been observed at customer sites on large -memory x86_64 systems. - -To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of -main memory will have over 32 million 4k pages in a single node. When a large -fraction of these pages are not evictable for any reason [see below], vmscan -will spend a lot of time scanning the LRU lists looking for the small fraction -of pages that are evictable. This can result in a situation where all CPUs are -spending 100% of their time in vmscan for hours or days on end, with the system -completely unresponsive. - -The unevictable list addresses the following classes of unevictable pages: - - * Those owned by ramfs. - - * Those mapped into SHM_LOCK'd shared memory regions. - - * Those mapped into VM_LOCKED [mlock()ed] VMAs. - -The infrastructure may also be able to handle other conditions that make pages -unevictable, either by definition or by circumstance, in the future. - - -The Unevictable LRU Page List ------------------------------ - -The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a -companion to the LRU-ordered anonymous and file, active and inactive page lists; -and now it is not even a page list. But following familiar convention, here in -this document and in the source, we often imagine it as a fifth LRU page list. - -The Unevictable LRU infrastructure consists of an additional, per-node, LRU list -called the "unevictable" list and an associated page flag, PG_unevictable, to -indicate that the page is being managed on the unevictable list. - -The PG_unevictable flag is analogous to, and mutually exclusive with, the -PG_active flag in that it indicates on which LRU list a page resides when -PG_lru is set. - -The Unevictable LRU infrastructure maintains unevictable pages as if they were -on an additional LRU list for a few reasons: - - (1) We get to "treat unevictable pages just like we treat other pages in the - system - which means we get to use the same code to manipulate them, the - same code to isolate them (for migrate, etc.), the same code to keep track - of the statistics, etc..." [Rik van Riel] - - (2) We want to be able to migrate unevictable pages between nodes for memory - defragmentation, workload management and memory hotplug. The Linux kernel - can only migrate pages that it can successfully isolate from the LRU - lists (or "Movable" pages: outside of consideration here). If we were to - maintain pages elsewhere than on an LRU-like list, where they can be - detected by isolate_lru_page(), we would prevent their migration. - -The unevictable list does not differentiate between file-backed and anonymous, -swap-backed pages. This differentiation is only important while the pages are, -in fact, evictable. - -The unevictable list benefits from the "arrayification" of the per-node LRU -lists and statistics originally proposed and posted by Christoph Lameter. - - -Memory Control Group Interaction --------------------------------- - -The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by -extending the lru_list enum. - -The memory controller data structure automatically gets a per-node unevictable -list as a result of the "arrayification" of the per-node LRU lists (one per -lru_list enum element). The memory controller tracks the movement of pages to -and from the unevictable list. - -When a memory control group comes under memory pressure, the controller will -not attempt to reclaim pages on the unevictable list. This has a couple of -effects: - - (1) Because the pages are "hidden" from reclaim on the unevictable list, the - reclaim process can be more efficient, dealing only with pages that have a - chance of being reclaimed. - - (2) On the other hand, if too many of the pages charged to the control group - are unevictable, the evictable portion of the working set of the tasks in - the control group may not fit into the available memory. This can cause - the control group to thrash or to OOM-kill tasks. - - -.. _mark_addr_space_unevict: - -Marking Address Spaces Unevictable ----------------------------------- - -For facilities such as ramfs none of the pages attached to the address space -may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE -address space flag is provided, and this can be manipulated by a filesystem -using a number of wrapper functions: - - * ``void mapping_set_unevictable(struct address_space *mapping);`` - - Mark the address space as being completely unevictable. - - * ``void mapping_clear_unevictable(struct address_space *mapping);`` - - Mark the address space as being evictable. - - * ``int mapping_unevictable(struct address_space *mapping);`` - - Query the address space, and return true if it is completely - unevictable. - -These are currently used in three places in the kernel: - - (1) By ramfs to mark the address spaces of its inodes when they are created, - and this mark remains for the life of the inode. - - (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. - Note that SHM_LOCK is not required to page in the locked pages if they're - swapped out; the application must touch the pages manually if it wants to - ensure they're in memory. - - (3) By the i915 driver to mark pinned address space until it's unpinned. The - amount of unevictable memory marked by i915 driver is roughly the bounded - object size in debugfs/dri/0/i915_gem_objects. - - -Detecting Unevictable Pages ---------------------------- - -The function page_evictable() in mm/internal.h determines whether a page is -evictable or not using the query function outlined above [see section -:ref:`Marking address spaces unevictable `] -to check the AS_UNEVICTABLE flag. - -For address spaces that are so marked after being populated (as SHM regions -might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate -the page tables for the region as does, for example, mlock(), nor need it make -any special effort to push any pages in the SHM_LOCK'd area to the unevictable -list. Instead, vmscan will do this if and when it encounters the pages during -a reclamation scan. - -On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan -the pages in the region and "rescue" them from the unevictable list if no other -condition is keeping them unevictable. If an unevictable region is destroyed, -the pages are also "rescued" from the unevictable list in the process of -freeing them. - -page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is -faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED. - - -Vmscan's Handling of Unevictable Pages --------------------------------------- - -If unevictable pages are culled in the fault path, or moved to the unevictable -list at mlock() or mmap() time, vmscan will not encounter the pages until they -have become evictable again (via munlock() for example) and have been "rescued" -from the unevictable list. However, there may be situations where we decide, -for the sake of expediency, to leave an unevictable page on one of the regular -active/inactive LRU lists for vmscan to deal with. vmscan checks for such -pages in all of the shrink_{active|inactive|page}_list() functions and will -"cull" such pages that it encounters: that is, it diverts those pages to the -unevictable list for the memory cgroup and node being scanned. - -There may be situations where a page is mapped into a VM_LOCKED VMA, but the -page is not marked as PG_mlocked. Such pages will make it all the way to -shrink_active_list() or shrink_page_list() where they will be detected when -vmscan walks the reverse map in page_referenced() or try_to_unmap(). The page -is culled to the unevictable list when it is released by the shrinker. - -To "cull" an unevictable page, vmscan simply puts the page back on the LRU list -using putback_lru_page() - the inverse operation to isolate_lru_page() - after -dropping the page lock. Because the condition which makes the page unevictable -may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the -unevictable state of a page before placing it on the unevictable list. - - -MLOCKED Pages -============= - -The unevictable page list is also useful for mlock(), in addition to ramfs and -SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in -NOMMU situations, all mappings are effectively mlocked. - - -History -------- - -The "Unevictable mlocked Pages" infrastructure is based on work originally -posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". -Nick posted his patch as an alternative to a patch posted by Christoph Lameter -to achieve the same objective: hiding mlocked pages from vmscan. - -In Nick's patch, he used one of the struct page LRU list link fields as a count -of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years -earlier). But this use of the link field for a count prevented the management -of the pages on an LRU list, and thus mlocked pages were not migratable as -isolate_lru_page() could not detect them, and the LRU list link field was not -available to the migration subsystem. - -Nick resolved this by putting mlocked pages back on the LRU list before -attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When -Nick's patch was integrated with the Unevictable LRU work, the count was -replaced by walking the reverse map when munlocking, to determine whether any -other VM_LOCKED VMAs still mapped the page. - -However, walking the reverse map for each page when munlocking was ugly and -inefficient, and could lead to catastrophic contention on a file's rmap lock, -when many processes which had it mlocked were trying to exit. In 5.18, the -idea of keeping mlock_count in Unevictable LRU list link field was revived and -put to work, without preventing the migration of mlocked pages. This is why -the "Unevictable LRU list" cannot be a linked list of pages now; but there was -no use for that linked list anyway - though its size is maintained for meminfo. - - -Basic Management ----------------- - -mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable -pages. When such a page has been "noticed" by the memory management subsystem, -the page is marked with the PG_mlocked flag. This can be manipulated using the -PageMlocked() functions. - -A PG_mlocked page will be placed on the unevictable list when it is added to -the LRU. Such pages can be "noticed" by memory management in several places: - - (1) in the mlock()/mlock2()/mlockall() system call handlers; - - (2) in the mmap() system call handler when mmapping a region with the - MAP_LOCKED flag; - - (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE - flag; - - (4) in the fault path and when a VM_LOCKED stack segment is expanded; or - - (5) as mentioned above, in vmscan:shrink_page_list() when attempting to - reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap(). - -mlocked pages become unlocked and rescued from the unevictable list when: - - (1) mapped in a range unlocked via the munlock()/munlockall() system calls; - - (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including - unmapping at task exit; - - (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; - or - - (4) before a page is COW'd in a VM_LOCKED VMA. - - -mlock()/mlock2()/mlockall() System Call Handling ------------------------------------------------- - -mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup() -for each VMA in the range specified by the call. In the case of mlockall(), -this is the entire active address space of the task. Note that mlock_fixup() -is used for both mlocking and munlocking a range of memory. A call to mlock() -an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is -treated as a no-op and mlock_fixup() simply returns. - -If the VMA passes some filtering as described in "Filtering Special VMAs" -below, mlock_fixup() will attempt to merge the VMA with its neighbors or split -off a subset of the VMA if the range does not cover the entire VMA. Any pages -already present in the VMA are then marked as mlocked by mlock_page() via -mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). - -Before returning from the system call, do_mlock() or mlockall() will call -__mm_populate() to fault in the remaining pages via get_user_pages() and to -mark those pages as mlocked as they are faulted. - -Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, -get_user_pages() will be unable to fault in the pages. That's okay. If pages -do end up getting faulted into this VM_LOCKED VMA, they will be handled in the -fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. - -For each PTE (or PMD) being faulted into a VMA, the page add rmap function -calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED -(unless it is a PTE mapping of a part of a transparent huge page). Or when -it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() -calls mlock_new_page() instead: similar to mlock_page(), but can make better -judgments, since this page is held exclusively and known not to be on LRU yet. - -mlock_page() sets PageMlocked immediately, then places the page on the CPU's -mlock pagevec, to batch up the rest of the work to be done under lru_lock by -__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count -and moves the page to unevictable state ("the unevictable LRU", but with -mlock_count in place of LRU threading). Or if the page was already PageLRU -and PageUnevictable and PageMlocked, it simply increments the mlock_count. - -But in practice that may not work ideally: the page may not yet be on an LRU, or -it may have been temporarily isolated from LRU. In such cases the mlock_count -field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() -returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: -rather than risk stranding a page indefinitely as unevictable, always err with -mlock_count on the low side, so that when munlocked the page will be rescued to -an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a -VM_LOCKED VMA. - - -Filtering Special VMAs ----------------------- - -mlock_fixup() filters several classes of "special" VMAs: - -1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind - these mappings are inherently pinned, so we don't need to mark them as - mlocked. In any case, most of the pages have no struct page in which to so - mark the page. Because of this, get_user_pages() will fail for these VMAs, - so there is no sense in attempting to visit them. - -2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We - neither need nor want to mlock() these pages. But __mm_populate() includes - hugetlbfs ranges, allocating the huge pages and populating the PTEs. - -3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, - such as the VDSO page, relay channel pages, etc. These pages are inherently - unevictable and are not managed on the LRU lists. __mm_populate() includes - these ranges, populating the PTEs if not already populated. - -4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate() - includes these ranges, populating the PTEs if not already populated. - -Note that for all of these special VMAs, mlock_fixup() does not set the -VM_LOCKED flag. Therefore, we won't have to deal with them later during -munlock(), munmap() or task exit. Neither does mlock_fixup() account these -VMAs against the task's "locked_vm". - - -munlock()/munlockall() System Call Handling -------------------------------------------- - -The munlock() and munlockall() system calls are handled by the same -mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are. -If called to munlock an already munlocked VMA, mlock_fixup() simply returns. -Because of the VMA filtering discussed above, VM_LOCKED will not be set in -any "special" VMAs. So, those VMAs will be ignored for munlock. - -If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the -specified range. All pages in the VMA are then munlocked by munlock_page() via -mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same -function used when mlocking a VMA range, with new flags for the VMA indicating -that it is munlock() being performed. - -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. - -But in practice that may not work ideally: the page may not yet have reached -"the unevictable LRU", or it may have been temporarily isolated from it. In -those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked -again later if vmscan finds it in a VM_LOCKED VMA. - - -Migrating MLOCKED Pages ------------------------ - -A page that is being migrated has been isolated from the LRU lists and is held -locked across unmapping of the page, updating the page's address space entry -and copying the contents and state, until the page table entry has been -replaced with an entry that refers to the new page. Linux supports migration -of mlocked pages and other unevictable pages. PG_mlocked is cleared from the -the old page when it is unmapped from the last VM_LOCKED VMA, and set when the -new page is mapped in place of migration entry in a VM_LOCKED VMA. If the page -was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the -page was unevictable for other reasons, PG_unevictable is copied explicitly. - -Note that page migration can race with mlocking or munlocking of the same page. -There is mostly no problem since page migration requires unmapping all PTEs of -the old page (including munlock where VM_LOCKED), then mapping in the new page -(including mlock where VM_LOCKED). The page table locks provide sufficient -synchronization. - -However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA, -before mlocking any pages already present, if one of those pages were migrated -before mlock_pte_range() reached it, it would get counted twice in mlock_count. -To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO, -so that mlock_vma_page() will skip it. - -To complete page migration, we place the old and new pages back onto the LRU -afterwards. The "unneeded" page - old page on success, new page on failure - -is freed when the reference count held by the migration process is released. - - -Compacting MLOCKED Pages ------------------------- - -The memory map can be scanned for compactable regions and the default behavior -is to let unevictable pages be moved. /proc/sys/vm/compact_unevictable_allowed -controls this behavior (see Documentation/admin-guide/sysctl/vm.rst). The work -of compaction is mostly handled by the page migration code and the same work -flow as described in Migrating MLOCKED Pages will apply. - - -MLOCKING Transparent Huge Pages -------------------------------- - -A transparent huge page is represented by a single entry on an LRU list. -Therefore, we can only make unevictable an entire compound page, not -individual subpages. - -If a user tries to mlock() part of a huge page, and no user mlock()s the -whole of the huge page, we want the rest of the page to be reclaimable. - -We cannot just split the page on partial mlock() as split_huge_page() can -fail and a new intermittent failure mode for the syscall is undesirable. - -We handle this by keeping PTE-mlocked huge pages on evictable LRU lists: -the PMD on the border of a VM_LOCKED VMA will be split into a PTE table. - -This way the huge page is accessible for vmscan. Under memory pressure the -page will be split, subpages which belong to VM_LOCKED VMAs will be moved -to the unevictable LRU and the rest can be reclaimed. - -/proc/meminfo's Unevictable and Mlocked amounts do not include those parts -of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs. - - -mmap(MAP_LOCKED) System Call Handling -------------------------------------- - -In addition to the mlock(), mlock2() and mlockall() system calls, an application -can request that a region of memory be mlocked by supplying the MAP_LOCKED flag -to the mmap() call. There is one important and subtle difference here, though. -mmap() + mlock() will fail if the range cannot be faulted in (e.g. because -mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. -The mmaped area will still have properties of the locked area - pages will not -get swapped out - but major page faults to fault memory in might still happen. - -Furthermore, any mmap() call or brk() call that expands the heap by a task -that has previously called mlockall() with the MCL_FUTURE flag will result -in the newly mapped memory being mlocked. Before the unevictable/mlock -changes, the kernel simply called make_pages_present() to allocate pages -and populate the page table. - -To mlock a range of memory under the unevictable/mlock infrastructure, -the mmap() handler and task address space expansion functions call -populate_vma_page_range() specifying the vma and the address range to mlock. - - -munmap()/exit()/exec() System Call Handling -------------------------------------------- - -When unmapping an mlocked region of memory, whether by an explicit call to -munmap() or via an internal unmap from exit() or exec() processing, we must -munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. -Before the unevictable/mlock changes, mlocking did not mark the pages in any -way, so unmapping them required no processing. - -For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED -(unless it was a PTE mapping of a part of a transparent huge page). - -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. - -But in practice that may not work ideally: the page may not yet have reached -"the unevictable LRU", or it may have been temporarily isolated from it. In -those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked -again later if vmscan finds it in a VM_LOCKED VMA. - - -Truncating MLOCKED Pages ------------------------- - -File truncation or hole punching forcibly unmaps the deleted pages from -userspace; truncation even unmaps and deletes any private anonymous pages -which had been Copied-On-Write from the file pages now being truncated. - -Mlocked pages can be munlocked and deleted in this way: like with munmap(), -for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED -(unless it was a PTE mapping of a part of a transparent huge page). - -However, if there is a racing munlock(), since mlock_vma_pages_range() starts -munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages -present, if one of those pages were unmapped by truncation or hole punch before -mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, -and would not be counted out of mlock_count. In this rare case, a page may -still appear as PageMlocked after it has been fully unmapped: and it is left to -release_pages() (or __page_cache_release()) to clear it and update statistics -before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, -which is usually 0). - - -Page Reclaim in shrink_*_list() -------------------------------- - -vmscan's shrink_active_list() culls any obviously unevictable pages - -i.e. !page_evictable(page) pages - diverting those to the unevictable list. -However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive LRU lists. Note that these pages do not have PageUnevictable -set - otherwise they would be on the unevictable list and shrink_active_list() -would never see them. - -Some examples of these unevictable pages on the LRU lists are: - - (1) ramfs pages that have been placed on the LRU lists when first allocated. - - (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to - allocate or fault in the pages in the shared memory region. This happens - when an application accesses the page the first time after SHM_LOCK'ing - the segment. - - (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, - but events left mlock_count too low, so they were munlocked too early. - -vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously -unevictable pages found on the inactive lists to the appropriate memory cgroup -and node unevictable list. - -rmap's page_referenced_one(), called via vmscan's shrink_active_list() or -shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), -check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() -to correct them. Such pages are culled to the unevictable list when released -by the shrinker. diff --git a/Documentation/vm/vmalloc.rst b/Documentation/vm/vmalloc.rst deleted file mode 100644 index 363fe20d6b9f..000000000000 --- a/Documentation/vm/vmalloc.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================================== -Virtually Contiguous Memory Allocation -====================================== diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/vm/vmalloced-kernel-stacks.rst deleted file mode 100644 index fc8c67833af6..000000000000 --- a/Documentation/vm/vmalloced-kernel-stacks.rst +++ /dev/null @@ -1,153 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -===================================== -Virtually Mapped Kernel Stack Support -===================================== - -:Author: Shuah Khan - -.. contents:: :local: - -Overview --------- - -This is a compilation of information from the code and original patch -series that introduced the `Virtually Mapped Kernel Stacks feature -` - -Introduction ------------- - -Kernel stack overflows are often hard to debug and make the kernel -susceptible to exploits. Problems could show up at a later time making -it difficult to isolate and root-cause. - -Virtually-mapped kernel stacks with guard pages causes kernel stack -overflows to be caught immediately rather than causing difficult to -diagnose corruptions. - -HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable -support for virtually mapped stacks with guard pages. This feature -causes reliable faults when the stack overflows. The usability of -the stack trace after overflow and response to the overflow itself -is architecture dependent. - -.. note:: - As of this writing, arm64, powerpc, riscv, s390, um, and x86 have - support for VMAP_STACK. - -HAVE_ARCH_VMAP_STACK --------------------- - -Architectures that can support Virtually Mapped Kernel Stacks should -enable this bool configuration option. The requirements are: - -- vmalloc space must be large enough to hold many kernel stacks. This - may rule out many 32-bit architectures. -- Stacks in vmalloc space need to work reliably. For example, if - vmap page tables are created on demand, either this mechanism - needs to work while the stack points to a virtual address with - unpopulated page tables or arch code (switch_to() and switch_mm(), - most likely) needs to ensure that the stack's page table entries - are populated before running on a possibly unpopulated stack. -- If the stack overflows into a guard page, something reasonable - should happen. The definition of "reasonable" is flexible, but - instantly rebooting without logging anything would be unfriendly. - -VMAP_STACK ----------- - -VMAP_STACK bool configuration option when enabled allocates virtually -mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK. - -- Enable this if you want the use virtually-mapped kernel stacks - with guard pages. This causes kernel stack overflows to be caught - immediately rather than causing difficult-to-diagnose corruption. - -.. note:: - - Using this feature with KASAN requires architecture support - for backing virtual mappings with real shadow memory, and - KASAN_VMALLOC must be enabled. - -.. note:: - - VMAP_STACK is enabled, it is not possible to run DMA on stack - allocated data. - -Kernel configuration options and dependencies keep changing. Refer to -the latest code base: - -`Kconfig ` - -Allocation ------------ - -When a new kernel thread is created, thread stack is allocated from -virtually contiguous memory pages from the page level allocator. These -pages are mapped into contiguous kernel virtual space with PAGE_KERNEL -protections. - -alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack -with PAGE_KERNEL protections. - -- Allocated stacks are cached and later reused by new threads, so memcg - accounting is performed manually on assigning/releasing stacks to tasks. - Hence, __vmalloc_node_range is called without __GFP_ACCOUNT. -- vm_struct is cached to be able to find when thread free is initiated - in interrupt context. free_thread_stack() can be called in interrupt - context. -- On arm64, all VMAP's stacks need to have the same alignment to ensure - that VMAP'd stack overflow detection works correctly. Arch specific - vmap stack allocator takes care of this detail. -- This does not address interrupt stacks - according to the original patch - -Thread stack allocation is initiated from clone(), fork(), vfork(), -kernel_thread() via kernel_clone(). Leaving a few hints for searching -the code base to understand when and how thread stack is allocated. - -Bulk of the code is in: -`kernel/fork.c `. - -stack_vm_area pointer in task_struct keeps track of the virtually allocated -stack and a non-null stack_vm_area pointer serves as a indication that the -virtually mapped kernel stacks are enabled. - -:: - - struct vm_struct *stack_vm_area; - -Stack overflow handling ------------------------ - -Leading and trailing guard pages help detect stack overflows. When stack -overflows into the guard pages, handlers have to be careful not overflow -the stack again. When handlers are called, it is likely that very little -stack space is left. - -On x86, this is done by handling the page fault indicating the kernel -stack overflow on the double-fault stack. - -Testing VMAP allocation with guard pages ----------------------------------------- - -How do we ensure that VMAP_STACK is actually allocating with a leading -and trailing guard page? The following lkdtm tests can help detect any -regressions. - -:: - - void lkdtm_STACK_GUARD_PAGE_LEADING() - void lkdtm_STACK_GUARD_PAGE_TRAILING() - -Conclusions ------------ - -- A percpu cache of vmalloced stacks appears to be a bit faster than a - high-order stack allocation, at least when the cache hits. -- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and - simply embed the thread_info (containing only flags) and 'int cpu' into - task_struct. -- The thread stack can be free'ed as soon as the task is dead (without - waiting for RCU) and then, if vmapped stacks are in use, cache the - entire stack for reuse on the same cpu. diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst deleted file mode 100644 index c9c495f62d12..000000000000 --- a/Documentation/vm/vmemmap_dedup.rst +++ /dev/null @@ -1,223 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================================= -A vmemmap diet for HugeTLB and Device DAX -========================================= - -HugeTLB -======= - -The struct page structures (page structs) are used to describe a physical -page frame. By default, there is a one-to-one mapping from a page frame to -it's corresponding page struct. - -HugeTLB pages consist of multiple base page size pages and is supported by many -architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more -details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are -currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page -consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. -For each base page, there is a corresponding page struct. - -Within the HugeTLB subsystem, only the first 4 page structs are used to -contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides -this upper limit. The only 'useful' information in the remaining page structs -is the compound_head field, and this field is the same for all tail pages. - -By removing redundant page structs for HugeTLB pages, memory can be returned -to the buddy allocator for other uses. - -Different architectures support different HugeTLB pages. For example, the -following table is the HugeTLB page size supported by x86 and arm64 -architectures. Because arm64 supports 4k, 16k, and 64k base pages and -supports contiguous entries, so it supports many kinds of sizes of HugeTLB -page. - -+--------------+-----------+-----------------------------------------------+ -| Architecture | Page Size | HugeTLB Page Size | -+--------------+-----------+-----------+-----------+-----------+-----------+ -| x86-64 | 4KB | 2MB | 1GB | | | -+--------------+-----------+-----------+-----------+-----------+-----------+ -| | 4KB | 64KB | 2MB | 32MB | 1GB | -| +-----------+-----------+-----------+-----------+-----------+ -| arm64 | 16KB | 2MB | 32MB | 1GB | | -| +-----------+-----------+-----------+-----------+-----------+ -| | 64KB | 2MB | 512MB | 16GB | | -+--------------+-----------+-----------+-----------+-----------+-----------+ - -When the system boot up, every HugeTLB page has more than one struct page -structs which size is (unit: pages):: - - struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - -Where HugeTLB_Size is the size of the HugeTLB page. We know that the size -of the HugeTLB page is always n times PAGE_SIZE. So we can get the following -relationship:: - - HugeTLB_Size = n * PAGE_SIZE - -Then:: - - struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - = n * sizeof(struct page) / PAGE_SIZE - -We can use huge mapping at the pud/pmd level for the HugeTLB page. - -For the HugeTLB page of the pmd level mapping, then:: - - struct_size = n * sizeof(struct page) / PAGE_SIZE - = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE - = sizeof(struct page) / sizeof(pte_t) - = 64 / 8 - = 8 (pages) - -Where n is how many pte entries which one page can contains. So the value of -n is (PAGE_SIZE / sizeof(pte_t)). - -This optimization only supports 64-bit system, so the value of sizeof(pte_t) -is 8. And this optimization also applicable only when the size of struct page -is a power of two. In most cases, the size of struct page is 64 bytes (e.g. -x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the -size of struct page structs of it is 8 page frames which size depends on the -size of the base page. - -For the HugeTLB page of the pud level mapping, then:: - - struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) - = PAGE_SIZE / 8 * 8 (pages) - = PAGE_SIZE (pages) - -Where the struct_size(pmd) is the size of the struct page structs of a -HugeTLB page of the pmd level mapping. - -E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB -HugeTLB page consists in 4096. - -Next, we take the pmd level mapping of the HugeTLB page as an example to -show the internal implementation of this optimization. There are 8 pages -struct page structs associated with a HugeTLB page which is pmd mapped. - -Here is how things look before optimization:: - - HugeTLB struct pages(8 pages) page frame(8 pages) - +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - | | | 0 | -------------> | 0 | - | | +-----------+ +-----------+ - | | | 1 | -------------> | 1 | - | | +-----------+ +-----------+ - | | | 2 | -------------> | 2 | - | | +-----------+ +-----------+ - | | | 3 | -------------> | 3 | - | | +-----------+ +-----------+ - | | | 4 | -------------> | 4 | - | PMD | +-----------+ +-----------+ - | level | | 5 | -------------> | 5 | - | mapping | +-----------+ +-----------+ - | | | 6 | -------------> | 6 | - | | +-----------+ +-----------+ - | | | 7 | -------------> | 7 | - | | +-----------+ +-----------+ - | | - | | - | | - +-----------+ - -The value of page->compound_head is the same for all tail pages. The first -page of page structs (page 0) associated with the HugeTLB page contains the 4 -page structs necessary to describe the HugeTLB. The only use of the remaining -pages of page structs (page 1 to page 7) is to point to page->compound_head. -Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs -will be used for each HugeTLB page. This will allow us to free the remaining -7 pages to the buddy allocator. - -Here is how things look after remapping:: - - HugeTLB struct pages(8 pages) page frame(8 pages) - +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - | | | 0 | -------------> | 0 | - | | +-----------+ +-----------+ - | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ - | | +-----------+ | | | | | | - | | | 2 | -----------------+ | | | | | - | | +-----------+ | | | | | - | | | 3 | -------------------+ | | | | - | | +-----------+ | | | | - | | | 4 | ---------------------+ | | | - | PMD | +-----------+ | | | - | level | | 5 | -----------------------+ | | - | mapping | +-----------+ | | - | | | 6 | -------------------------+ | - | | +-----------+ | - | | | 7 | ---------------------------+ - | | +-----------+ - | | - | | - | | - +-----------+ - -When a HugeTLB is freed to the buddy system, we should allocate 7 pages for -vmemmap pages and restore the previous mapping relationship. - -For the HugeTLB page of the pud level mapping. It is similar to the former. -We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. - -Apart from the HugeTLB page of the pmd/pud level mapping, some architectures -(e.g. aarch64) provides a contiguous bit in the translation table entries -that hints to the MMU to indicate that it is one of a contiguous set of -entries that can be cached in a single TLB entry. - -The contiguous bit is used to increase the mapping size at the pmd and pte -(last) level. So this type of HugeTLB page can be optimized only when its -size of the struct page structs is greater than 1 page. - -Notice: The head vmemmap page is not freed to the buddy allocator and all -tail vmemmap pages are mapped to the head vmemmap page frame. So we can see -more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) -associated with each HugeTLB page. The compound_head() can handle this -correctly (more details refer to the comment above compound_head()). - -Device DAX -========== - -The device-dax interface uses the same tail deduplication technique explained -in the previous chapter, except when used with the vmemmap in -the device (altmap). - -The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), -PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). - -The differences with HugeTLB are relatively minor. - -It only use 3 page structs for storing all information as opposed -to 4 on HugeTLB pages. - -There's no remapping of vmemmap given that device-dax memory is not part of -System RAM ranges initialized at boot. Thus the tail page deduplication -happens at a later stage when we populate the sections. HugeTLB reuses the -the head vmemmap page representing, whereas device-dax reuses the tail -vmemmap page. This results in only half of the savings compared to HugeTLB. - -Deduplicated tail pages are not mapped read-only. - -Here's how things look like on device-dax after the sections are populated:: - - +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - | | | 0 | -------------> | 0 | - | | +-----------+ +-----------+ - | | | 1 | -------------> | 1 | - | | +-----------+ +-----------+ - | | | 2 | ----------------^ ^ ^ ^ ^ ^ - | | +-----------+ | | | | | - | | | 3 | ------------------+ | | | | - | | +-----------+ | | | | - | | | 4 | --------------------+ | | | - | PMD | +-----------+ | | | - | level | | 5 | ----------------------+ | | - | mapping | +-----------+ | | - | | | 6 | ------------------------+ | - | | +-----------+ | - | | | 7 | --------------------------+ - | | +-----------+ - | | - | | - | | - +-----------+ diff --git a/Documentation/vm/z3fold.rst b/Documentation/vm/z3fold.rst deleted file mode 100644 index 224e3c61d686..000000000000 --- a/Documentation/vm/z3fold.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _z3fold: - -====== -z3fold -====== - -z3fold is a special purpose allocator for storing compressed pages. -It is designed to store up to three compressed pages per physical page. -It is a zbud derivative which allows for higher compression -ratio keeping the simplicity and determinism of its predecessor. - -The main differences between z3fold and zbud are: - -* unlike zbud, z3fold allows for up to PAGE_SIZE allocations -* z3fold can hold up to 3 compressed pages in its page -* z3fold doesn't export any API itself and is thus intended to be used - via the zpool API. - -To keep the determinism and simplicity, z3fold, just like zbud, always -stores an integral number of compressed pages per page, but it can store -up to 3 pages unlike zbud which can store at most 2. Therefore the -compression ratio goes to around 2.7x while zbud's one is around 1.7x. - -Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not -return a dereferenceable pointer. Instead, it returns an unsigned long -handle which encodes actual location of the allocated object. - -Keeping effective compression ratio close to zsmalloc's, z3fold doesn't -depend on MMU enabled and provides more predictable reclaim behavior -which makes it a better fit for small and response-critical systems. diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/vm/zsmalloc.rst deleted file mode 100644 index 6e79893d6132..000000000000 --- a/Documentation/vm/zsmalloc.rst +++ /dev/null @@ -1,82 +0,0 @@ -.. _zsmalloc: - -======== -zsmalloc -======== - -This allocator is designed for use with zram. Thus, the allocator is -supposed to work well under low memory conditions. In particular, it -never attempts higher order page allocation which is very likely to -fail under memory pressure. On the other hand, if we just use single -(0-order) pages, it would suffer from very high fragmentation -- -any object of size PAGE_SIZE/2 or larger would occupy an entire page. -This was one of the major issues with its predecessor (xvmalloc). - -To overcome these issues, zsmalloc allocates a bunch of 0-order pages -and links them together using various 'struct page' fields. These linked -pages act as a single higher-order page i.e. an object can span 0-order -page boundaries. The code refers to these linked pages as a single entity -called zspage. - -For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE -since this satisfies the requirements of all its current users (in the -worst case, page is incompressible and is thus stored "as-is" i.e. in -uncompressed form). For allocation requests larger than this size, failure -is returned (see zs_malloc). - -Additionally, zs_malloc() does not return a dereferenceable pointer. -Instead, it returns an opaque handle (unsigned long) which encodes actual -location of the allocated object. The reason for this indirection is that -zsmalloc does not keep zspages permanently mapped since that would cause -issues on 32-bit systems where the VA region for kernel space mappings -is very small. So, before using the allocating memory, the object has to -be mapped using zs_map_object() to get a usable pointer and subsequently -unmapped using zs_unmap_object(). - -stat -==== - -With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via -``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: - - # cat /sys/kernel/debug/zsmalloc/zram0/classes - - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage - ... - ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 - ... - ... - - -class - index -size - object size zspage stores -almost_empty - the number of ZS_ALMOST_EMPTY zspages(see below) -almost_full - the number of ZS_ALMOST_FULL zspages(see below) -obj_allocated - the number of objects allocated -obj_used - the number of objects allocated to the user -pages_used - the number of pages allocated for the class -pages_per_zspage - the number of 0-order pages to make a zspage - -We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where - -* n = number of allocated objects -* N = total number of objects zspage can store -* f = fullness_threshold_frac(ie, 4 at the moment) - -Similarly, we assign zspage to: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N diff --git a/MAINTAINERS b/MAINTAINERS index fe5daf141501..55fb1daa9057 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5526,7 +5526,7 @@ L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-damon F: Documentation/admin-guide/mm/damon/ -F: Documentation/vm/damon/ +F: Documentation/mm/damon/ F: include/linux/damon.h F: include/trace/events/damon.h F: mm/damon/ @@ -9037,7 +9037,7 @@ HMM - Heterogeneous Memory Management M: Jérôme Glisse L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/hmm.rst +F: Documentation/mm/hmm.rst F: include/linux/hmm* F: lib/test_hmm* F: mm/hmm* @@ -9135,8 +9135,8 @@ L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages F: Documentation/admin-guide/mm/hugetlbpage.rst -F: Documentation/vm/hugetlbfs_reserv.rst -F: Documentation/vm/vmemmap_dedup.rst +F: Documentation/mm/hugetlbfs_reserv.rst +F: Documentation/mm/vmemmap_dedup.rst F: fs/hugetlbfs/ F: include/linux/hugetlb.h F: mm/hugetlb.c @@ -15072,7 +15072,7 @@ M: Pasha Tatashin M: Andrew Morton L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/page_table_check.rst +F: Documentation/mm/page_table_check.rst F: include/linux/page_table_check.h F: mm/page_table_check.c @@ -22158,7 +22158,7 @@ M: Nitin Gupta R: Sergey Senozhatsky L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/zsmalloc.rst +F: Documentation/mm/zsmalloc.rst F: include/linux/zsmalloc.h F: mm/zsmalloc.c diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 1920d52653b4..db2838cf8c02 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -410,7 +410,7 @@ config ARCH_SPARSEMEM_ENABLE Say Y to support efficient handling of sparse physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See for more. + See for more. config ARCH_ENABLE_THP_MIGRATION def_bool y diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb9d5fd39d7f..392ff48f77df 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1273,7 +1273,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, * should return true. * We should not call this on a hugetlb entry. We should check for HugeTLB * entry using vma->vm_flags - * The page table walk rule is explained in Documentation/vm/transhuge.rst + * The page table walk rule is explained in Documentation/mm/transhuge.rst */ static inline int pmd_trans_huge(pmd_t pmd) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index d5a6f101f843..126a36571667 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -4,7 +4,7 @@ * * Authors: Jérôme Glisse * - * See Documentation/vm/hmm.rst for reasons and overview of what HMM is. + * See Documentation/mm/hmm.rst for reasons and overview of what HMM is. */ #ifndef LINUX_HMM_H #define LINUX_HMM_H @@ -100,7 +100,7 @@ struct hmm_range { }; /* - * Please see Documentation/vm/hmm.rst for how to use the range API. + * Please see Documentation/mm/hmm.rst for how to use the range API. */ int hmm_range_fault(struct hmm_range *range); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8af304f6b504..9f5ee49482de 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -39,7 +39,7 @@ struct vmem_altmap { * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. + * include/linux/hmm.h and Documentation/mm/hmm.rst. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 45fc2c81e370..d6c06e140277 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -198,7 +198,7 @@ struct mmu_notifier_ops { * invalidate_range_start()/end() notifiers, as * invalidate_range() already catches the points in time when an * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/vm/mmu_notifier.rst + * discussion on this see Documentation/mm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8cd975a8bfeb..2a243616f222 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -29,7 +29,7 @@ extern struct mm_struct *mm_alloc(void); * * Use mmdrop() to release the reference acquired by mmgrab(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) @@ -92,7 +92,7 @@ static inline void mmdrop_sched(struct mm_struct *mm) * * Use mmput() to release the reference acquired by mmget(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0c0fed1b348f..95a5b7aa1ae9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,7 +74,7 @@ static inline int current_is_kswapd(void) /* * Unaddressable device memory support. See include/linux/hmm.h and - * Documentation/vm/hmm.rst. Short description is we need struct pages for + * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * diff --git a/mm/Kconfig b/mm/Kconfig index 169e64192e48..c1fa4993a56f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -663,7 +663,7 @@ config KSM the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.rst for more information: KSM is inactive + See Documentation/mm/ksm.rst for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1ab091f49fc0..dc7df1254f0a 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -35,7 +35,7 @@ #include /* - * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics + * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. */ diff --git a/mm/frontswap.c b/mm/frontswap.c index 6f69b044a8cc..1a97610308cb 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -4,7 +4,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of frontswap. See - * Documentation/vm/frontswap.rst for more information. + * Documentation/mm/frontswap.rst for more information. * * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. * Author: Dan Magenheimer diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 834f288b3769..f9b90a8d7dfa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1937,7 +1937,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * replacing a zero pmd write protected page with a zero pte write * protected page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ pmdp_huge_clear_flush(vma, haddr, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a57e1be41401..b36a4ef87a2e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4875,7 +4875,7 @@ again: * table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); @@ -6403,7 +6403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(&range); @@ -7102,7 +7102,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) i_mmap_unlock_write(vma->vm_file->f_mapping); /* * No need to call mmu_notifier_invalidate_range(), see - * Documentation/vm/mmu_notifier.rst. + * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1089ea8a9c98..ba29c15c53d6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -6,7 +6,7 @@ * * Author: Muchun Song * - * See Documentation/vm/vmemmap_dedup.rst + * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt diff --git a/mm/ksm.c b/mm/ksm.c index 54f78c9eecae..8d2dc501c92c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1083,7 +1083,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* @@ -1186,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are replacing a read only page with another * read only page with the same content. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); diff --git a/mm/mmap.c b/mm/mmap.c index 61e6135c54ef..c14d7286a379 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2944,7 +2944,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) diff --git a/mm/rmap.c b/mm/rmap.c index 5bcb334cd6f2..65e0a767b837 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -999,7 +999,7 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * downgrading page table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ if (ret) cleaned++; @@ -1765,7 +1765,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * to point at a new folio while a device is * still using this folio. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(&folio->page)); } @@ -1775,7 +1775,7 @@ discard: * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) @@ -2093,7 +2093,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 652f11a05749..3ff88a2eefb8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -752,7 +752,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, /* * Reuse the previous page for the rest of tail pages - * See layout diagram in Documentation/vm/vmemmap_dedup.rst + * See layout diagram in Documentation/mm/vmemmap_dedup.rst */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, diff --git a/mm/util.c b/mm/util.c index 0837570c9225..5df8f2db7ca9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst + * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c149427eb1c9..74c3dcecf64d 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -8,7 +8,7 @@ * Or sort by total memory: * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt * - * See Documentation/vm/page_owner.rst + * See Documentation/mm/page_owner.rst */ #include -- cgit v1.2.3 From 4313a24985f00340eeb591fd66aa2b257b9e0a69 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 May 2022 21:59:02 +0200 Subject: arch/*/: remove CONFIG_VIRT_TO_BUS All architecture-independent users of virt_to_bus() and bus_to_virt() have been fixed to use the dma mapping interfaces or have been removed now. This means the definitions on most architectures, and the CONFIG_VIRT_TO_BUS symbol are now obsolete and can be removed. The only exceptions to this are a few network and scsi drivers for m68k Amiga and VME machines and ppc32 Macintosh. These drivers work correctly with the old interfaces and are probably not worth changing. On alpha and parisc, virt_to_bus() were still used in asm/floppy.h. alpha can use isa_virt_to_bus() like x86 does, and parisc can just open-code the virt_to_phys() here, as this is architecture specific code. I tried updating the bus-virt-phys-mapping.rst documentation, which started as an email from Linus to explain some details of the Linux-2.0 driver interfaces. The bits about virt_to_bus() were declared obsolete backin 2000, and the rest is not all that relevant any more, so in the end I just decided to remove the file completely. Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Michael Ellerman (powerpc) Acked-by: Helge Deller # parisc Signed-off-by: Arnd Bergmann --- Documentation/core-api/bus-virt-phys-mapping.rst | 220 --------------------- Documentation/core-api/dma-api-howto.rst | 14 -- Documentation/core-api/index.rst | 1 - .../translations/zh_CN/core-api/index.rst | 1 - arch/alpha/Kconfig | 1 - arch/alpha/include/asm/floppy.h | 2 +- arch/alpha/include/asm/io.h | 8 +- arch/ia64/Kconfig | 1 - arch/ia64/include/asm/io.h | 8 - arch/m68k/Kconfig | 1 - arch/m68k/include/asm/virtconvert.h | 4 +- arch/microblaze/Kconfig | 1 - arch/microblaze/include/asm/io.h | 2 - arch/mips/Kconfig | 1 - arch/mips/include/asm/io.h | 9 - arch/parisc/Kconfig | 1 - arch/parisc/include/asm/floppy.h | 4 +- arch/parisc/include/asm/io.h | 2 - arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/io.h | 2 - arch/riscv/include/asm/page.h | 1 - arch/x86/Kconfig | 1 - arch/x86/include/asm/io.h | 9 - arch/xtensa/Kconfig | 1 - arch/xtensa/include/asm/io.h | 3 - include/asm-generic/io.h | 14 -- mm/Kconfig | 8 - 27 files changed, 10 insertions(+), 311 deletions(-) delete mode 100644 Documentation/core-api/bus-virt-phys-mapping.rst (limited to 'arch/powerpc/include') diff --git a/Documentation/core-api/bus-virt-phys-mapping.rst b/Documentation/core-api/bus-virt-phys-mapping.rst deleted file mode 100644 index c72b24a7d52c..000000000000 --- a/Documentation/core-api/bus-virt-phys-mapping.rst +++ /dev/null @@ -1,220 +0,0 @@ -========================================================== -How to access I/O mapped memory from within device drivers -========================================================== - -:Author: Linus - -.. warning:: - - The virt_to_bus() and bus_to_virt() functions have been - superseded by the functionality provided by the PCI DMA interface - (see Documentation/core-api/dma-api-howto.rst). They continue - to be documented below for historical purposes, but new code - must not use them. --davidm 00/12/12 - -:: - - [ This is a mail message in response to a query on IO mapping, thus the - strange format for a "document" ] - -The AHA-1542 is a bus-master device, and your patch makes the driver give the -controller the physical address of the buffers, which is correct on x86 -(because all bus master devices see the physical memory mappings directly). - -However, on many setups, there are actually **three** different ways of looking -at memory addresses, and in this case we actually want the third, the -so-called "bus address". - -Essentially, the three ways of addressing memory are (this is "real memory", -that is, normal RAM--see later about other details): - - - CPU untranslated. This is the "physical" address. Physical address - 0 is what the CPU sees when it drives zeroes on the memory bus. - - - CPU translated address. This is the "virtual" address, and is - completely internal to the CPU itself with the CPU doing the appropriate - translations into "CPU untranslated". - - - bus address. This is the address of memory as seen by OTHER devices, - not the CPU. Now, in theory there could be many different bus - addresses, with each device seeing memory in some device-specific way, but - happily most hardware designers aren't actually actively trying to make - things any more complex than necessary, so you can assume that all - external hardware sees the memory the same way. - -Now, on normal PCs the bus address is exactly the same as the physical -address, and things are very simple indeed. However, they are that simple -because the memory and the devices share the same address space, and that is -not generally necessarily true on other PCI/ISA setups. - -Now, just as an example, on the PReP (PowerPC Reference Platform), the -CPU sees a memory map something like this (this is from memory):: - - 0-2 GB "real memory" - 2 GB-3 GB "system IO" (inb/out and similar accesses on x86) - 3 GB-4 GB "IO memory" (shared memory over the IO bus) - -Now, that looks simple enough. However, when you look at the same thing from -the viewpoint of the devices, you have the reverse, and the physical memory -address 0 actually shows up as address 2 GB for any IO master. - -So when the CPU wants any bus master to write to physical memory 0, it -has to give the master address 0x80000000 as the memory address. - -So, for example, depending on how the kernel is actually mapped on the -PPC, you can end up with a setup like this:: - - physical address: 0 - virtual address: 0xC0000000 - bus address: 0x80000000 - -where all the addresses actually point to the same thing. It's just seen -through different translations.. - -Similarly, on the Alpha, the normal translation is:: - - physical address: 0 - virtual address: 0xfffffc0000000000 - bus address: 0x40000000 - -(but there are also Alphas where the physical address and the bus address -are the same). - -Anyway, the way to look up all these translations, you do:: - - #include - - phys_addr = virt_to_phys(virt_addr); - virt_addr = phys_to_virt(phys_addr); - bus_addr = virt_to_bus(virt_addr); - virt_addr = bus_to_virt(bus_addr); - -Now, when do you need these? - -You want the **virtual** address when you are actually going to access that -pointer from the kernel. So you can have something like this:: - - /* - * this is the hardware "mailbox" we use to communicate with - * the controller. The controller sees this directly. - */ - struct mailbox { - __u32 status; - __u32 bufstart; - __u32 buflen; - .. - } mbox; - - unsigned char * retbuffer; - - /* get the address from the controller */ - retbuffer = bus_to_virt(mbox.bufstart); - switch (retbuffer[0]) { - case STATUS_OK: - ... - -on the other hand, you want the bus address when you have a buffer that -you want to give to the controller:: - - /* ask the controller to read the sense status into "sense_buffer" */ - mbox.bufstart = virt_to_bus(&sense_buffer); - mbox.buflen = sizeof(sense_buffer); - mbox.status = 0; - notify_controller(&mbox); - -And you generally **never** want to use the physical address, because you can't -use that from the CPU (the CPU only uses translated virtual addresses), and -you can't use it from the bus master. - -So why do we care about the physical address at all? We do need the physical -address in some cases, it's just not very often in normal code. The physical -address is needed if you use memory mappings, for example, because the -"remap_pfn_range()" mm function wants the physical address of the memory to -be remapped as measured in units of pages, a.k.a. the pfn (the memory -management layer doesn't know about devices outside the CPU, so it -shouldn't need to know about "bus addresses" etc). - -.. note:: - - The above is only one part of the whole equation. The above - only talks about "real memory", that is, CPU memory (RAM). - -There is a completely different type of memory too, and that's the "shared -memory" on the PCI or ISA bus. That's generally not RAM (although in the case -of a video graphics card it can be normal DRAM that is just used for a frame -buffer), but can be things like a packet buffer in a network card etc. - -This memory is called "PCI memory" or "shared memory" or "IO memory" or -whatever, and there is only one way to access it: the readb/writeb and -related functions. You should never take the address of such memory, because -there is really nothing you can do with such an address: it's not -conceptually in the same memory space as "real memory" at all, so you cannot -just dereference a pointer. (Sadly, on x86 it **is** in the same memory space, -so on x86 it actually works to just deference a pointer, but it's not -portable). - -For such memory, you can do things like: - - - reading:: - - /* - * read first 32 bits from ISA memory at 0xC0000, aka - * C000:0000 in DOS terms - */ - unsigned int signature = isa_readl(0xC0000); - - - remapping and writing:: - - /* - * remap framebuffer PCI memory area at 0xFC000000, - * size 1MB, so that we can access it: We can directly - * access only the 640k-1MB area, so anything else - * has to be remapped. - */ - void __iomem *baseptr = ioremap(0xFC000000, 1024*1024); - - /* write a 'A' to the offset 10 of the area */ - writeb('A',baseptr+10); - - /* unmap when we unload the driver */ - iounmap(baseptr); - - - copying and clearing:: - - /* get the 6-byte Ethernet address at ISA address E000:0040 */ - memcpy_fromio(kernel_buffer, 0xE0040, 6); - /* write a packet to the driver */ - memcpy_toio(0xE1000, skb->data, skb->len); - /* clear the frame buffer */ - memset_io(0xA0000, 0, 0x10000); - -OK, that just about covers the basics of accessing IO portably. Questions? -Comments? You may think that all the above is overly complex, but one day you -might find yourself with a 500 MHz Alpha in front of you, and then you'll be -happy that your driver works ;) - -Note that kernel versions 2.0.x (and earlier) mistakenly called the -ioremap() function "vremap()". ioremap() is the proper name, but I -didn't think straight when I wrote it originally. People who have to -support both can do something like:: - - /* support old naming silliness */ - #if LINUX_VERSION_CODE < 0x020100 - #define ioremap vremap - #define iounmap vfree - #endif - -at the top of their source files, and then they can use the right names -even on 2.0.x systems. - -And the above sounds worse than it really is. Most real drivers really -don't do all that complex things (or rather: the complexity is not so -much in the actual IO accesses as in error handling and timeouts etc). -It's generally not hard to fix drivers, and in many cases the code -actually looks better afterwards:: - - unsigned long signature = *(unsigned int *) 0xC0000; - vs - unsigned long signature = readl(0xC0000); - -I think the second version actually is more readable, no? diff --git a/Documentation/core-api/dma-api-howto.rst b/Documentation/core-api/dma-api-howto.rst index 358d495456d1..828846804e25 100644 --- a/Documentation/core-api/dma-api-howto.rst +++ b/Documentation/core-api/dma-api-howto.rst @@ -707,20 +707,6 @@ to use the dma_sync_*() interfaces:: } } -Drivers converted fully to this interface should not use virt_to_bus() any -longer, nor should they use bus_to_virt(). Some drivers have to be changed a -little bit, because there is no longer an equivalent to bus_to_virt() in the -dynamic DMA mapping scheme - you have to always store the DMA addresses -returned by the dma_alloc_coherent(), dma_pool_alloc(), and dma_map_single() -calls (dma_map_sg() stores them in the scatterlist itself if the platform -supports dynamic DMA mapping in hardware) in your driver structures and/or -in the card registers. - -All drivers should be using these interfaces with no exceptions. It -is planned to completely remove virt_to_bus() and bus_to_virt() as -they are entirely deprecated. Some ports already do not provide these -as it is impossible to correctly support them. - Handling Errors =============== diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index dedd4d853329..726065a3095e 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -41,7 +41,6 @@ Library functionality that is used throughout the kernel. rbtree generic-radix-tree packing - bus-virt-phys-mapping this_cpu_ops timekeeping errseq diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst index 26d9913fc8b6..c52175fc1b61 100644 --- a/Documentation/translations/zh_CN/core-api/index.rst +++ b/Documentation/translations/zh_CN/core-api/index.rst @@ -52,7 +52,6 @@ Todolist: circular-buffers generic-radix-tree packing - bus-virt-phys-mapping this_cpu_ops timekeeping errseq diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 7d0d26b5b3f5..97fce7386b00 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -17,7 +17,6 @@ config ALPHA select HAVE_PERF_EVENTS select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH - select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP select AUTO_IRQ_AFFINITY if SMP diff --git a/arch/alpha/include/asm/floppy.h b/arch/alpha/include/asm/floppy.h index 588758685439..64b42d9591fc 100644 --- a/arch/alpha/include/asm/floppy.h +++ b/arch/alpha/include/asm/floppy.h @@ -20,7 +20,7 @@ #define fd_free_dma() free_dma(FLOPPY_DMA) #define fd_clear_dma_ff() clear_dma_ff(FLOPPY_DMA) #define fd_set_dma_mode(mode) set_dma_mode(FLOPPY_DMA,mode) -#define fd_set_dma_addr(addr) set_dma_addr(FLOPPY_DMA,virt_to_bus(addr)) +#define fd_set_dma_addr(addr) set_dma_addr(FLOPPY_DMA,isa_virt_to_bus(addr)) #define fd_set_dma_count(count) set_dma_count(FLOPPY_DMA,count) #define fd_enable_irq() enable_irq(FLOPPY_IRQ) #define fd_disable_irq() disable_irq(FLOPPY_IRQ) diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index c9cb554fbe54..d277189b2677 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -106,15 +106,15 @@ static inline void * phys_to_virt(unsigned long address) extern unsigned long __direct_map_base; extern unsigned long __direct_map_size; -static inline unsigned long __deprecated virt_to_bus(volatile void *address) +static inline unsigned long __deprecated isa_virt_to_bus(volatile void *address) { unsigned long phys = virt_to_phys(address); unsigned long bus = phys + __direct_map_base; return phys <= __direct_map_size ? bus : 0; } -#define isa_virt_to_bus virt_to_bus +#define isa_virt_to_bus isa_virt_to_bus -static inline void * __deprecated bus_to_virt(unsigned long address) +static inline void * __deprecated isa_bus_to_virt(unsigned long address) { void *virt; @@ -125,7 +125,7 @@ static inline void * __deprecated bus_to_virt(unsigned long address) virt = phys_to_virt(address); return (long)address <= 0 ? NULL : virt; } -#define isa_bus_to_virt bus_to_virt +#define isa_bus_to_virt isa_bus_to_virt /* * There are different chipsets to interface the Alpha CPUs to the world. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index cb93769a9f2a..26ac8ea15a9e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -39,7 +39,6 @@ config IA64 select HAVE_FUNCTION_DESCRIPTORS select HAVE_VIRT_CPU_ACCOUNTING select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE - select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 6d93b923b379..ce66dfc0e719 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -96,14 +96,6 @@ extern u64 kern_mem_attribute (unsigned long phys_addr, unsigned long size); extern int valid_phys_addr_range (phys_addr_t addr, size_t count); /* efi.c */ extern int valid_mmap_phys_addr_range (unsigned long pfn, size_t count); -/* - * The following two macros are deprecated and scheduled for removal. - * Please use the PCI-DMA interface defined in instead. - */ -#define bus_to_virt phys_to_virt -#define virt_to_bus virt_to_phys -#define page_to_bus page_to_phys - # endif /* KERNEL */ /* diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 936cce42ae9a..b06faf6c0b27 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -30,7 +30,6 @@ config M68K select OLD_SIGACTION select OLD_SIGSUSPEND3 select UACCESS_MEMCPY if !MMU - select VIRT_TO_BUS select ZONE_DMA config CPU_BIG_ENDIAN diff --git a/arch/m68k/include/asm/virtconvert.h b/arch/m68k/include/asm/virtconvert.h index ca91b32dc6ef..0a27905b0036 100644 --- a/arch/m68k/include/asm/virtconvert.h +++ b/arch/m68k/include/asm/virtconvert.h @@ -33,9 +33,11 @@ static inline void *phys_to_virt(unsigned long address) /* * IO bus memory addresses are 1:1 with the physical address, + * deprecated globally but still used on two machines. */ +#if defined(CONFIG_AMIGA) || defined(CONFIG_VME) #define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt +#endif #endif #endif diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 8cf429ad1c84..415182eeb082 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -38,7 +38,6 @@ config MICROBLAZE select OF_EARLY_FLATTREE select PCI_DOMAINS_GENERIC if PCI select PCI_SYSCALL if PCI - select VIRT_TO_BUS select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE select SPARSE_IRQ diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h index b6a57f8468f0..c1d78b8977a6 100644 --- a/arch/microblaze/include/asm/io.h +++ b/arch/microblaze/include/asm/io.h @@ -30,8 +30,6 @@ extern resource_size_t isa_mem_base; #define PCI_IOBASE ((void __iomem *)_IO_BASE) #define IO_SPACE_LIMIT (0xFFFFFFFF) -#define page_to_bus(page) (page_to_phys(page)) - extern void iounmap(volatile void __iomem *addr); extern void __iomem *ioremap(phys_addr_t address, unsigned long size); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index db09d45d59ec..ff0c76dfbbad 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -100,7 +100,6 @@ config MIPS select RTC_LIB select SYSCTL_EXCEPTION_TRACE select TRACE_IRQFLAGS_SUPPORT - select VIRT_TO_BUS select ARCH_HAS_ELFCORE_COMPAT select HAVE_ARCH_KCSAN if 64BIT diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index 6f5c86d2bab4..cd9168f34fb7 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -147,15 +147,6 @@ static inline void *isa_bus_to_virt(unsigned long address) return phys_to_virt(address); } -/* - * However PCI ones are not necessarily 1:1 and therefore these interfaces - * are forbidden in portable PCI drivers. - * - * Allow them for x86 for legacy drivers, though. - */ -#define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt - /* * Change "struct page" to physical address. */ diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 5f2448dc5a2b..b0d68e9e2df0 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -43,7 +43,6 @@ config PARISC select SYSCTL_ARCH_UNALIGN_ALLOW select SYSCTL_EXCEPTION_TRACE select HAVE_MOD_ARCH_SPECIFIC - select VIRT_TO_BUS select MODULES_USE_ELF_RELA select CLONE_BACKWARDS select TTY # Needed for pdc_cons.c diff --git a/arch/parisc/include/asm/floppy.h b/arch/parisc/include/asm/floppy.h index 762cfe7778c0..b318a7df52f6 100644 --- a/arch/parisc/include/asm/floppy.h +++ b/arch/parisc/include/asm/floppy.h @@ -179,7 +179,7 @@ static void _fd_chose_dma_mode(char *addr, unsigned long size) { if(can_use_virtual_dma == 2) { if((unsigned int) addr >= (unsigned int) high_memory || - virt_to_bus(addr) >= 0x1000000 || + virt_to_phys(addr) >= 0x1000000 || _CROSS_64KB(addr, size, 0)) use_virtual_dma = 1; else @@ -215,7 +215,7 @@ static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) doing_pdma = 0; clear_dma_ff(FLOPPY_DMA); set_dma_mode(FLOPPY_DMA,mode); - set_dma_addr(FLOPPY_DMA,virt_to_bus(addr)); + set_dma_addr(FLOPPY_DMA,virt_to_phys(addr)); set_dma_count(FLOPPY_DMA,size); enable_dma(FLOPPY_DMA); return 0; diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index 837ddddbac6a..42ffb60a6ea9 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -7,8 +7,6 @@ #define virt_to_phys(a) ((unsigned long)__pa(a)) #define phys_to_virt(a) __va(a) -#define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt static inline unsigned long isa_bus_to_virt(unsigned long addr) { BUG(); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c2ce2e60c8f0..ae8ab95c4389 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -276,7 +276,6 @@ config PPC select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT - select VIRT_TO_BUS if !PPC64 # # Please keep this list sorted alphabetically. # diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index c5a5f7c9b231..73fcd5cdb662 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -985,8 +985,6 @@ static inline void * bus_to_virt(unsigned long address) } #define bus_to_virt bus_to_virt -#define page_to_bus(page) (page_to_phys(page) + PCI_DRAM_OFFSET) - #endif /* CONFIG_PPC32 */ /* access ports */ diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 1526e410e802..ac70b0fd9a9a 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -167,7 +167,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) #define page_to_phys(page) (pfn_to_phys(page_to_pfn(page))) -#define page_to_bus(page) (page_to_phys(page)) #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be0b95e51df6..9a81b867c3bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -278,7 +278,6 @@ config X86 select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select USER_STACKTRACE_SUPPORT - select VIRT_TO_BUS select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 1870b99c3356..e9025640f634 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -169,15 +169,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) } #define isa_bus_to_virt phys_to_virt -/* - * However PCI ones are not necessarily 1:1 and therefore these interfaces - * are forbidden in portable PCI drivers. - * - * Allow them on x86 for legacy drivers, though. - */ -#define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt - /* * The default ioremap() behavior is non-cached; if you need something * else, you probably want one of the following. diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 0b0f0172cced..92a24ed738a5 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -50,7 +50,6 @@ config XTENSA select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC select TRACE_IRQFLAGS_SUPPORT - select VIRT_TO_BUS help Xtensa processors are 32-bit RISC machines designed by Tensilica primarily for embedded systems. These processors are both diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h index 54188e69b988..a5b707e1c0f4 100644 --- a/arch/xtensa/include/asm/io.h +++ b/arch/xtensa/include/asm/io.h @@ -63,9 +63,6 @@ static inline void iounmap(volatile void __iomem *addr) xtensa_iounmap(addr); } -#define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt - #endif /* CONFIG_MMU */ #include diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 9c5114335a2d..7197e0642757 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1142,20 +1142,6 @@ static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) } #endif -#ifdef CONFIG_VIRT_TO_BUS -#ifndef virt_to_bus -static inline unsigned long virt_to_bus(void *address) -{ - return (unsigned long)address; -} - -static inline void *bus_to_virt(unsigned long address) -{ - return (void *)address; -} -#endif -#endif - #ifndef memset_io #define memset_io memset_io /** diff --git a/mm/Kconfig b/mm/Kconfig index 169e64192e48..b7a44b17c79f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -639,14 +639,6 @@ config BOUNCE memory available to the CPU. Enabled by default when HIGHMEM is selected, but you may say n to override this. -config VIRT_TO_BUS - bool - help - An architecture should select this if it implements the - deprecated interface virt_to_bus(). All new architectures - should probably not select this. - - config MMU_NOTIFIER bool select SRCU -- cgit v1.2.3 From 46d60bdb1283bb0f22d9480e2d6c972623cb4182 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 6 May 2022 11:14:24 +0200 Subject: powerpc: Include asm/firmware.h in all users of firmware_has_feature() Trying to remove asm/ppc_asm.h from all places that don't need it leads to several failures linked to firmware_has_feature(). To fix it, include asm/firmware.h in all files using firmware_has_feature() All users found with: git grep -L "firmware\.h" ` git grep -l "firmware_has_feature("` Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/11956ec181a034b51a881ac9c059eea72c679a73.1651828453.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 3 +++ arch/powerpc/include/asm/cputime.h | 1 + arch/powerpc/include/asm/interrupt.h | 1 + arch/powerpc/include/asm/mman.h | 1 + arch/powerpc/include/asm/prom.h | 1 + arch/powerpc/kernel/dawr.c | 1 + arch/powerpc/kexec/core.c | 1 + arch/powerpc/kvm/book3s_64_mmu_radix.c | 1 + arch/powerpc/kvm/book3s_hv_nested.c | 1 + arch/powerpc/mm/book3s64/hash_pgtable.c | 1 + arch/powerpc/mm/book3s64/pkeys.c | 1 + arch/powerpc/mm/hugetlbpage.c | 1 + arch/powerpc/platforms/pseries/papr_platform_attributes.c | 1 + arch/powerpc/platforms/pseries/vas.c | 1 + 14 files changed, 16 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index b37a28f62cf6..aa1c67c8bfc8 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_H #define _ASM_POWERPC_BOOK3S_64_HUGETLB_H + +#include + /* * For radix we want generic code to handle hugetlb. But then if we want * both hash and radix to be enabled together we need to workaround the diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 504f7fe6711a..6d2b27997492 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -19,6 +19,7 @@ #include #include #include +#include typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index b14f54d789d2..8069dbc4b8d1 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index 1b024e64c8ec..17a77d47ed6d 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -12,6 +12,7 @@ #include #include #include +#include static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, unsigned long pkey) diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 5c80152e8f18..6f109b5cb84e 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -14,6 +14,7 @@ #include #include #include +#include /* These includes should be removed once implicit includes are cleaned up. */ #include diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index 30d4eca88d17..909a05cd2809 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -11,6 +11,7 @@ #include #include #include +#include bool dawr_force_enable; EXPORT_SYMBOL_GPL(dawr_force_enable); diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 9773dd0640f3..cf84bfe9e27e 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -20,6 +20,7 @@ #include #include #include +#include void machine_kexec_mask_interrupts(void) { unsigned int i; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 42851c32ff3b..9d4b3feda3b6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Supported radix tree geometry. diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 0644732d1a25..be8249cc6107 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct patb_entry *pseries_partition_tb; diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 2e0cad5817ba..ae008b9df0e6 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -13,6 +13,7 @@ #include #include #include +#include #include diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index 753e62ba67af..1d2675ab6711 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b282af39fcf6..bc84a594ca62 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,6 +24,7 @@ #include #include #include +#include bool hugetlb_disabled = false; diff --git a/arch/powerpc/platforms/pseries/papr_platform_attributes.c b/arch/powerpc/platforms/pseries/papr_platform_attributes.c index 515150417bb3..526c621b098b 100644 --- a/arch/powerpc/platforms/pseries/papr_platform_attributes.c +++ b/arch/powerpc/platforms/pseries/papr_platform_attributes.c @@ -22,6 +22,7 @@ #include #include +#include #include "pseries.h" diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 500a1fc4a1d7..91e7eda0606c 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "vas.h" -- cgit v1.2.3 From e93dee186fc95f2058b0c9d2317d8b876b8512db Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 6 May 2022 11:14:25 +0200 Subject: powerpc: Don't include asm/ppc_asm.h in other headers asm/ppc_asm.h is not needed in any of the header it is included. It is only needed by irq.c. Include it there and remove it from other headers. word-at-a-time.h only need ex_table.h, so include it instead. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e2d7b96547037f852c7ed164e4f79e8918c2607a.1651828453.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/io.h | 1 - arch/powerpc/include/asm/uaccess.h | 1 - arch/powerpc/include/asm/word-at-a-time.h | 2 +- arch/powerpc/kernel/irq.c | 1 + 4 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index c5a5f7c9b231..9b07133c3bd1 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -33,7 +33,6 @@ extern struct pci_dev *isa_bridge_pcidev; #include #include #include -#include #define SIO_CONFIG_RA 0x398 #define SIO_CONFIG_RD 0x399 diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 9b82b38ff867..14a08806f8e8 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -2,7 +2,6 @@ #ifndef _ARCH_POWERPC_UACCESS_H #define _ARCH_POWERPC_UACCESS_H -#include #include #include #include diff --git a/arch/powerpc/include/asm/word-at-a-time.h b/arch/powerpc/include/asm/word-at-a-time.h index f3f4710d4ff5..46c31fb8748d 100644 --- a/arch/powerpc/include/asm/word-at-a-time.h +++ b/arch/powerpc/include/asm/word-at-a-time.h @@ -7,7 +7,7 @@ #include #include -#include +#include #ifdef __BIG_ENDIAN__ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index dd09919c3c66..4db27198b002 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -65,6 +65,7 @@ #include #include #include +#include #ifdef CONFIG_PPC64 #include -- cgit v1.2.3 From ef5b570d3700fbb8628a58da0487486ceeb713cd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 18 May 2022 10:32:27 +0200 Subject: powerpc/irq: Don't open code irq_soft_mask helpers Use READ_ONCE() and WRITE_ONCE() instead of open coding read and write of local PACA irq_soft_mask. For the write, add a barrier to keep the memory clobber that was there previously. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e2454434992cc932a5a34b695ae981c0b2f4c28e.1652862729.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 43 +++++++-------------------------------- 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 674e5aaafcbd..edc569481faf 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -113,14 +113,7 @@ static inline void __hard_RI_enable(void) static inline notrace unsigned long irq_soft_mask_return(void) { - unsigned long flags; - - asm volatile( - "lbz %0,%1(13)" - : "=r" (flags) - : "i" (offsetof(struct paca_struct, irq_soft_mask))); - - return flags; + return READ_ONCE(local_paca->irq_soft_mask); } /* @@ -148,46 +141,24 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) WARN_ON(mask && !(mask & IRQS_DISABLED)); #endif - asm volatile( - "stb %0,%1(13)" - : - : "r" (mask), - "i" (offsetof(struct paca_struct, irq_soft_mask)) - : "memory"); + WRITE_ONCE(local_paca->irq_soft_mask, mask); + barrier(); } static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) { - unsigned long flags; - -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON(mask && !(mask & IRQS_DISABLED)); -#endif + unsigned long flags = irq_soft_mask_return(); - asm volatile( - "lbz %0,%1(13); stb %2,%1(13)" - : "=&r" (flags) - : "i" (offsetof(struct paca_struct, irq_soft_mask)), - "r" (mask) - : "memory"); + irq_soft_mask_set(mask); return flags; } static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) { - unsigned long flags, tmp; - - asm volatile( - "lbz %0,%2(13); or %1,%0,%3; stb %1,%2(13)" - : "=&r" (flags), "=r" (tmp) - : "i" (offsetof(struct paca_struct, irq_soft_mask)), - "r" (mask) - : "memory"); + unsigned long flags = irq_soft_mask_return(); -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON((mask | flags) && !((mask | flags) & IRQS_DISABLED)); -#endif + irq_soft_mask_set(flags | mask); return flags; } -- cgit v1.2.3 From 78ffe6a7e2a169c4dcbbd08717a0a8d738659d15 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 18 May 2022 10:32:28 +0200 Subject: powerpc/irq: Replace #ifdefs by IS_ENABLED() Replace #ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG and #ifdef CONFIG_PERF_EVENTS by IS_ENABLED() in hw_irq.h and plpar_wrappers.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c1ded642f8d9002767f8fed48ed6d1e76254ed73.1652862729.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 30 ++++++++++++++---------------- arch/powerpc/include/asm/plpar_wrappers.h | 5 ++--- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index edc569481faf..6efab00aa1c8 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -123,7 +123,6 @@ static inline notrace unsigned long irq_soft_mask_return(void) */ static inline notrace void irq_soft_mask_set(unsigned long mask) { -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG /* * The irq mask must always include the STD bit if any are set. * @@ -138,8 +137,8 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) * unmasks to be replayed, among other things. For now, take * the simple approach. */ - WARN_ON(mask && !(mask & IRQS_DISABLED)); -#endif + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON(mask && !(mask & IRQS_DISABLED)); WRITE_ONCE(local_paca->irq_soft_mask, mask); barrier(); @@ -324,11 +323,13 @@ bool power_pmu_wants_prompt_pmi(void); */ static inline bool should_hard_irq_enable(void) { -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); - WARN_ON(mfmsr() & MSR_EE); -#endif -#ifdef CONFIG_PERF_EVENTS + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); + WARN_ON(mfmsr() & MSR_EE); + } + + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return false; /* * If the PMU is not running, there is not much reason to enable * MSR[EE] in irq handlers because any interrupts would just be @@ -343,9 +344,6 @@ static inline bool should_hard_irq_enable(void) return false; return true; -#else - return false; -#endif } /* @@ -353,11 +351,11 @@ static inline bool should_hard_irq_enable(void) */ static inline void do_hard_irq_enable(void) { -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); - WARN_ON(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK); - WARN_ON(mfmsr() & MSR_EE); -#endif + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); + WARN_ON(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK); + WARN_ON(mfmsr() & MSR_EE); + } /* * This allows PMI interrupts (and watchdog soft-NMIs) through. * There is no other reason to enable this way. diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 83e0f701ebc6..8239c0af5eb2 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -43,11 +43,10 @@ static inline long extended_cede_processor(unsigned long latency_hint) set_cede_latency_hint(latency_hint); rc = cede_processor(); -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + /* Ensure that H_CEDE returns with IRQs on */ - if (WARN_ON(!(mfmsr() & MSR_EE))) + if (WARN_ON(IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && !(mfmsr() & MSR_EE))) __hard_irq_enable(); -#endif set_cede_latency_hint(old_latency_hint); -- cgit v1.2.3 From 077fc62b2b66a95af43dbb363fb8e932999812d3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 18 May 2022 10:48:55 +0200 Subject: powerpc/irq: remove inline assembly in hard_irq_disable macro Use WRITE_ONCE() instead of opencoding the saving of current stack pointeur. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9f05937d8722ddd2064a7c2362d8f9000e15e1ba.1652863723.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 6efab00aa1c8..26ede09c521d 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -282,9 +282,7 @@ static inline bool pmi_irq_pending(void) flags = irq_soft_mask_set_return(IRQS_ALL_DISABLED); \ local_paca->irq_happened |= PACA_IRQ_HARD_DIS; \ if (!arch_irqs_disabled_flags(flags)) { \ - asm ("stdx %%r1, 0, %1 ;" \ - : "=m" (local_paca->saved_r1) \ - : "b" (&local_paca->saved_r1)); \ + WRITE_ONCE(local_paca->saved_r1, current_stack_pointer);\ trace_hardirqs_off(); \ } \ } while(0) -- cgit v1.2.3 From 051bd351a2ef9c69753dc9cf6bd396986f27778c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 9 Jun 2022 12:16:40 +0200 Subject: powerpc/irq: Make __do_irq() static Since commit 48cf12d88969 ("powerpc/irq: Inline call_do_irq() and call_do_softirq()"), __do_irq() is not used outside irq.c Reorder functions and make __do_irq() static and drop the declaration in irq.h. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/adbe1c8315ec2d63259f41468e82e51677bb1eda.1654769775.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/irq.h | 1 - arch/powerpc/kernel/irq.c | 46 +++++++++++++++++++++--------------------- 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 13f0409dd617..5c1516a5ba8f 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -54,7 +54,6 @@ extern void *softirq_ctx[NR_CPUS]; void __do_IRQ(struct pt_regs *regs); extern void __init init_IRQ(void); -extern void __do_irq(struct pt_regs *regs); int irq_choose_cpu(const struct cpumask *mask); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index bf7078b8a765..037db84026a1 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -220,31 +220,9 @@ static __always_inline void call_do_softirq(const void *sp) ); } -static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) -{ - register unsigned long r3 asm("r3") = (unsigned long)regs; - - /* Temporarily switch r1 to sp, call __do_irq() then restore r1. */ - asm volatile ( - PPC_STLU " %%r1, %[offset](%[sp]) ;" - "mr %%r1, %[sp] ;" - "bl %[callee] ;" - PPC_LL " %%r1, 0(%%r1) ;" - : // Outputs - "+r" (r3) - : // Inputs - [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_OVERHEAD), - [callee] "i" (__do_irq) - : // Clobbers - "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", - "cr7", "r0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", - "r11", "r12" - ); -} - DEFINE_STATIC_CALL_RET0(ppc_get_irq, *ppc_md.get_irq); -void __do_irq(struct pt_regs *regs) +static void __do_irq(struct pt_regs *regs) { unsigned int irq; @@ -270,6 +248,28 @@ void __do_irq(struct pt_regs *regs) trace_irq_exit(regs); } +static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) +{ + register unsigned long r3 asm("r3") = (unsigned long)regs; + + /* Temporarily switch r1 to sp, call __do_irq() then restore r1. */ + asm volatile ( + PPC_STLU " %%r1, %[offset](%[sp]) ;" + "mr %%r1, %[sp] ;" + "bl %[callee] ;" + PPC_LL " %%r1, 0(%%r1) ;" + : // Outputs + "+r" (r3) + : // Inputs + [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_OVERHEAD), + [callee] "i" (__do_irq) + : // Clobbers + "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", + "cr7", "r0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12" + ); +} + void __do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); -- cgit v1.2.3 From 12a9eddd239e14ccbf0ea50b3504a21bda002ba9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Jun 2022 08:21:22 +0200 Subject: powerpc: Remove _PAGE_SAO stub for book3e/64 Since commit 634093c59a12 ("powerpc/mm: enable ARCH_HAS_VM_GET_PAGE_PROT"), _PAGE_SAO is used only in arch/powerpc/mm/book3s64/pgtable.c The _PAGE_SAO stub defined as 0 for book3e/64 can be removed. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/715e644fb3c7d992c0b71f6165ab6cf8c682055a.1655706069.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 57083f95e82b..25b72b2b30ce 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -83,8 +83,6 @@ */ #include -#define _PAGE_SAO 0 - #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) /* -- cgit v1.2.3 From 2db2008e636327a3caa648ef1e34a9d501d20e2e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 23 Jun 2022 10:56:57 +0200 Subject: powerpc/64e: Rewrite p4d_populate() as a static inline function Rewrite p4d_populate() as a static inline function instead of a macro. This change allows typechecking and would have helped detecting a recently found bug in map_kernel_page(). Signed-off-by: Christophe Leroy Acked-by: Mike Rapoport Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1b416f8a8fe1bc3f4e01175680ce310b7eb3a1e4.1655974565.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/64/pgalloc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 668aee6017e7..e50b211becb3 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -15,7 +15,10 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -#define p4d_populate(MM, P4D, PUD) p4d_set(P4D, (unsigned long)PUD) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + p4d_set(p4d, (unsigned long)pud); +} static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { -- cgit v1.2.3 From 3adfb457b84bd6de4e78a99814038fbd7205f253 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 28 Jun 2022 16:48:55 +0200 Subject: powerpc/64e: Remove MMU_FTR_USE_TLBRSRV and MMU_FTR_USE_PAIRED_MAS Commit fb5a515704d7 ("powerpc: Remove platforms/wsp and associated pieces") removed the last CPU having features MMU_FTRS_A2 and commit cd68098bcedd ("powerpc: Clean up MMU_FTRS_A2 and MMU_FTR_TYPE_3E") removed MMU_FTRS_A2 which was the last user of MMU_FTR_USE_TLBRSRV and MMU_FTR_USE_PAIRED_MAS. Remove all code that relies on MMU_FTR_USE_TLBRSRV and MMU_FTR_USE_PAIRED_MAS. With this change done, TLB miss can happen before the mmu feature fixups. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cfd5a0ecdb1598da968832e1bddf7431ec267200.1656427701.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/mmu.h | 12 ------ arch/powerpc/kernel/setup_64.c | 1 - arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 30 ++++--------- arch/powerpc/mm/nohash/tlb_low_64e.S | 66 ----------------------------- 4 files changed, 8 insertions(+), 101 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 5f41565a1e5d..860d0290ca4d 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -96,15 +96,6 @@ */ #define MMU_FTR_NEED_DTLB_SW_LRU ASM_CONST(0x00200000) -/* Enable use of TLB reservation. Processor should support tlbsrx. - * instruction and MAS0[WQ]. - */ -#define MMU_FTR_USE_TLBRSRV ASM_CONST(0x00800000) - -/* Use paired MAS registers (MAS7||MAS3, etc.) - */ -#define MMU_FTR_USE_PAIRED_MAS ASM_CONST(0x01000000) - /* Doesn't support the B bit (1T segment) in SLBIE */ #define MMU_FTR_NO_SLBIE_B ASM_CONST(0x02000000) @@ -180,9 +171,6 @@ enum { #ifdef CONFIG_PPC_83xx MMU_FTR_NEED_DTLB_SW_LRU | #endif -#ifdef CONFIG_PPC_BOOK3E_64 - MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS | -#endif #ifdef CONFIG_PPC_BOOK3S_64 MMU_FTR_KERNEL_RO | #ifdef CONFIG_PPC_64S_HASH_MMU diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 5761f08dae95..2b2d0b0fbb30 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -113,7 +113,6 @@ void __init setup_tlb_core_data(void) * Should we panic instead? */ WARN_ONCE(smt_enabled_at_boot >= 2 && - !mmu_has_feature(MMU_FTR_USE_TLBRSRV) && book3e_htw_mode != PPC_HTW_E6500, "%s: unsupported MMU configuration\n", __func__); } diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index 307ca919d393..c7d4b317a823 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -103,21 +103,11 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) int found = 0; mtspr(SPRN_MAS6, pid << 16); - if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { - asm volatile( - "li %0,0\n" - "tlbsx. 0,%1\n" - "bne 1f\n" - "li %0,1\n" - "1:\n" - : "=&r"(found) : "r"(ea)); - } else { - asm volatile( - "tlbsx 0,%1\n" - "mfspr %0,0x271\n" - "srwi %0,%0,31\n" - : "=&r"(found) : "r"(ea)); - } + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); return found; } @@ -169,13 +159,9 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) mtspr(SPRN_MAS1, mas1); mtspr(SPRN_MAS2, mas2); - if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { - mtspr(SPRN_MAS7_MAS3, mas7_3); - } else { - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - } + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); asm volatile ("tlbwe"); diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 9e9ab3803fb2..a59485c549a7 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -152,16 +152,7 @@ tlb_miss_common_bolted: clrrdi r15,r15,3 beq tlb_miss_fault_bolted /* No PGDIR, bail */ -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ldx r14,r14,r15 /* grab pgd entry */ - beq tlb_miss_done_bolted /* tlb exists already, bail */ -MMU_FTR_SECTION_ELSE ldx r14,r14,r15 /* grab pgd entry */ -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -674,16 +665,7 @@ normal_tlb_miss: clrrdi r14,r14,3 or r10,r15,r14 -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) ld r14,0(r10) - beq normal_tlb_miss_done -MMU_FTR_SECTION_ELSE - ld r14,0(r10) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) finish_normal_tlb_miss: /* Check if required permissions are met */ @@ -727,13 +709,9 @@ finish_normal_tlb_miss: li r11,MAS3_SW|MAS3_UW andc r15,r15,r11 1: -BEGIN_MMU_FTR_SECTION srdi r16,r15,32 mtspr SPRN_MAS3,r15 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r15 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -809,13 +787,6 @@ virt_page_table_tlb_miss: #else 1: #endif -BEGIN_MMU_FTR_SECTION - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - */ - PPC_TLBSRX_DOT(0,R16) - beq virt_page_table_tlb_miss_done -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Now, we need to walk the page tables. First check if we are in * range. @@ -866,41 +837,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) clrldi r11,r15,4 /* remove region ID from RPN */ ori r10,r11,1 /* Or-in SR */ -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe -BEGIN_MMU_FTR_SECTION -virt_page_table_tlb_miss_done: - - /* We have overridden MAS2:EPN but currently our primary TLB miss - * handler will always restore it so that should not be an issue, - * if we ever optimize the primary handler to not write MAS2 on - * some cases, we'll have to restore MAS2:EPN here based on the - * original fault's DEAR. If we do that we have to modify the - * ITLB miss handler to also store SRR0 in the exception frame - * as DEAR. - * - * However, one nasty thing we did is we cleared the reservation - * (well, potentially we did). We do a trick here thus if we - * are not a level 0 exception (we interrupted the TLB miss) we - * offset the return address by -4 in order to replay the tlbsrx - * instruction there - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- 1f - ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) - addi r10,r11,-4 - std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) -1: -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Return to caller, normal case */ TLB_MISS_EPILOG_SUCCESS rfi @@ -1115,13 +1057,9 @@ htw_tlb_miss: */ ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -1202,13 +1140,9 @@ tlb_load_linear: clrldi r10,r10,4 /* clear region bits */ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe -- cgit v1.2.3 From b646c1f7f43c13510d519e3044c87aa32352fc1f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 28 Jun 2022 16:48:56 +0200 Subject: powerpc/64e: Remove unused REGION related macros Those macros are not used anywhere. Remove them as they are soon going to be wrong and are not worth modifying as they are not used. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f0efde8cee0924c3991790042b176ac77ad35e1f.1656427701.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/64/pgtable.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 25b72b2b30ce..d29f1d65ef05 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -57,18 +57,6 @@ #define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE - FIXADDR_SIZE) #define FIXADDR_SIZE SZ_32M - -/* - * Region IDs - */ -#define REGION_SHIFT 60UL -#define REGION_MASK (0xfUL << REGION_SHIFT) -#define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) - -#define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) -#define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define USER_REGION_ID (0UL) - /* * Defines the address of the vmemap area, in its own region on * after the vmalloc space on Book3E -- cgit v1.2.3 From 128c1ea2f838d3031a1c475607860e4271a8e9dc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 28 Jun 2022 16:48:57 +0200 Subject: powerpc/64e: Move virtual memory closer to linear memory Today nohash/64 have linear memory based at 0xc000000000000000 and virtual memory based at 0x8000000000000000. In order to implement KASAN, we need to regroup both areas. Move virtual memmory at 0xc000100000000000. This complicates a bit TLB miss handlers. Until now, memory region was easily identified with the 4 higher bits of address: - 0 ==> User - c ==> Linear Memory - 8 ==> Virtual Memory Now we need to rely on the 20 higher bits, with: - 0xxxx ==> User - c0000 ==> Linear Memory - c0001 ==> Virtual Memory Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4b225168031449fc34fc7132f3923cc8dc54af60.1656427701.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 +- arch/powerpc/mm/nohash/tlb_low_64e.S | 64 ++++++++++++++++------------ 2 files changed, 38 insertions(+), 28 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index d29f1d65ef05..9104cc4121b2 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -25,7 +25,7 @@ /* * Define the address range of the kernel non-linear virtual area */ -#define KERN_VIRT_START ASM_CONST(0x8000000000000000) +#define KERN_VIRT_START ASM_CONST(0xc000100000000000) #define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) /* diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index a59485c549a7..68ffbfdba894 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -213,10 +213,11 @@ itlb_miss_kernel_bolted: tlb_miss_kernel_bolted: mfspr r10,SPRN_MAS1 ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + andi. r15,r15,1 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_bolted + bne+ tlb_miss_common_bolted tlb_miss_fault_bolted: /* We need to check if it was an instruction miss */ @@ -498,7 +499,9 @@ tlb_miss_huge_e6500: tlb_miss_kernel_e6500: ld r14,PACA_KERNELPGD(r13) - cmpldi cr1,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + xoris r15,r15,0xc /* Check for vmalloc region */ + cmplwi cr1,r15,1 beq+ cr1,tlb_miss_common_e6500 tlb_miss_fault_e6500: @@ -532,16 +535,18 @@ itlb_miss_fault_e6500: */ mfspr r14,SPRN_ESR mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ + srdi r15,r16,44 /* get region */ + xoris r15,r15,0xc + cmpldi cr0,r15,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r15,1 /* vmalloc mapping ? */ /* The page tables are mapped virtually linear. At this point, though, * we don't know whether we are trying to fault in a first level * virtual address or a virtual page table address. We can get that * from bit 0x1 of the region ID which we have set for a page table */ - andi. r10,r15,0x1 + andis. r10,r15,0x1 bne- virt_page_table_tlb_miss std r14,EX_TLB_ESR(r12); /* save ESR */ @@ -553,7 +558,7 @@ itlb_miss_fault_e6500: /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r15,0 /* Check for user region */ + srdi. r15,r16,60 /* Check for user region */ /* We pre-test some combination of permissions to avoid double * faults: @@ -577,10 +582,9 @@ itlb_miss_fault_e6500: beq normal_tlb_miss_user /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss + beq+ cr1,normal_tlb_miss /* We got a crappy address, just fault with whatever DEAR and ESR * are here @@ -606,16 +610,18 @@ itlb_miss_fault_e6500: * * Faulting address is SRR0 which is already in r16 */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ + srdi r15,r16,44 /* get region */ + xoris r15,r15,0xc + cmpldi cr0,r15,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r15,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h - cmpldi cr0,r15,0 /* Check for user region */ + srdi. r15,r16,60 /* Check for user region */ std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ beq normal_tlb_miss_user @@ -623,10 +629,9 @@ itlb_miss_fault_e6500: oris r11,r11,_PAGE_ACCESSED@h /* XXX replace the RMW cycles with immediate loads + writes */ mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss + beq+ cr1,normal_tlb_miss /* We got a crappy address, just fault */ TLB_MISS_EPILOG_ERROR @@ -659,10 +664,11 @@ normal_tlb_miss: * NOTE: For 64K pages, we do things slightly differently in * order to handle the weird page table format used by linux */ - ori r10,r15,0x1 + srdi r15,r16,44 + oris r10,r15,0x1 rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 - sldi r15,r10,60 - clrrdi r14,r14,3 + sldi r15,r10,44 + clrrdi r14,r14,19 or r10,r15,r14 ld r14,0(r10) @@ -763,6 +769,7 @@ normal_tlb_miss_access_fault: */ virt_page_table_tlb_miss: /* Are we hitting a kernel page table ? */ + srdi r15,r16,60 andi. r10,r15,0x8 /* The cool thing now is that r10 contains 0 for user and 8 for kernel, @@ -791,7 +798,8 @@ virt_page_table_tlb_miss: /* Now, we need to walk the page tables. First check if we are in * range. */ - rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + rldicl r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + cmpldi r10,0x80 bne- virt_page_table_tlb_miss_fault /* Get the PGD pointer */ @@ -910,23 +918,24 @@ virt_page_table_tlb_miss_whacko_fault: */ mfspr r14,SPRN_ESR mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ + srdi r11,r16,44 /* get region */ + xoris r11,r11,0xc + cmpldi cr0,r11,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r11,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r11,0 /* Check for user region */ + srdi. r11,r16,60 /* Check for user region */ ld r15,PACAPGD(r13) /* Load user pgdir */ beq htw_tlb_miss /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss + beq+ cr1,htw_tlb_miss /* We got a crappy address, just fault with whatever DEAR and ESR * are here @@ -952,19 +961,20 @@ virt_page_table_tlb_miss_whacko_fault: * * Faulting address is SRR0 which is already in r16 */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ + srdi r11,r16,44 /* get region */ + xoris r11,r11,0xc + cmpldi cr0,r11,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r11,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r11,0 /* Check for user region */ + srdi. r11,r16,60 /* Check for user region */ ld r15,PACAPGD(r13) /* Load user pgdir */ beq htw_tlb_miss /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ -- cgit v1.2.3 From 059c189389ebe9c4909d849d1a5f65c53115ca19 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 28 Jun 2022 16:48:58 +0200 Subject: powerpc/64e: Reorganise virtual memory Reduce the size of IO map in order to leave the last quarter of virtual MAP for KASAN shadow mapping. This gives the following layout. +------------------------+ Kernel virtual map end (0xc000200000000000) | | | 16TB (unused) | | | +------------------------+ Kernel IO map end | | | 16TB of IO map | | | +------------------------+ Kernel IO map start | | | 16TB of vmemmap | | | +------------------------+ Kernel vmemmap start | | | 16TB of vmap | | | +------------------------+ Kernel virt start (0xc000100000000000) | | | 64TB of linear mem | | | +------------------------+ Kernel linear (0xc.....) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/54ef01673bf14228106afd629f795c83acb9a00c.1656427701.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/64/pgtable.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 9104cc4121b2..599921cc257e 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -38,15 +38,16 @@ #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* - * The second half of the kernel virtual space is used for IO mappings, + * The third quarter of the kernel virtual space is used for IO mappings, * it's itself carved into the PIO region (ISA and PHB IO space) and * the ioremap space * * ISA_IO_BASE = KERN_IO_START, 64K reserved area * PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces - * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE + * IOREMAP_BASE = ISA_IO_BASE + 2G to KERN_IO_START + KERN_IO_SIZE */ #define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1)) +#define KERN_IO_SIZE (KERN_VIRT_SIZE >> 2) #define FULL_IO_SIZE 0x80000000ul #define ISA_IO_BASE (KERN_IO_START) #define ISA_IO_END (KERN_IO_START + 0x10000ul) @@ -54,7 +55,7 @@ #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) #define IOREMAP_START (ioremap_bot) -#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE - FIXADDR_SIZE) +#define IOREMAP_END (KERN_IO_START + KERN_IO_SIZE - FIXADDR_SIZE) #define FIXADDR_SIZE SZ_32M /* -- cgit v1.2.3 From c7b9ed7c34a9f5dbf8222d63e3e313cef9f3150b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 28 Jun 2022 16:48:59 +0200 Subject: powerpc/64e: KASAN Full support for BOOK3E/64 We now have memory organised in a way that allows implementing KASAN. Unlike book3s/64, book3e always has translation active so the only thing needed to use KASAN is to setup an early zero shadow mapping just after setting a stack pointer and before calling early_setup(). The memory layout is now as follows +------------------------+ Kernel virtual map end (0xc000200000000000) | | | 16TB of KASAN map | | | +------------------------+ Kernel KASAN shadow map start | | | 16TB of IO map | | | +------------------------+ Kernel IO map start | | | 16TB of vmemmap | | | +------------------------+ Kernel vmemmap start | | | 16TB of vmap | | | +------------------------+ Kernel virt start (0xc000100000000000) | | | 64TB of linear mem | | | +------------------------+ Kernel linear (0xc.....) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0bef8beda27baf71e3b9e8b13e620fba6e19499b.1656427701.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 + arch/powerpc/Kconfig.debug | 3 +- arch/powerpc/include/asm/kasan.h | 13 +++- arch/powerpc/kernel/head_64.S | 3 + arch/powerpc/mm/kasan/Makefile | 1 + arch/powerpc/mm/kasan/init_book3e_64.c | 133 +++++++++++++++++++++++++++++++++ arch/powerpc/mm/kasan/init_book3s_64.c | 2 + arch/powerpc/platforms/Kconfig.cputype | 1 - 8 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/mm/kasan/init_book3e_64.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c2ce2e60c8f0..92e0cad7752d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -193,6 +193,7 @@ config PPC select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if PPC32 && PPC_PAGE_SHIFT <= 14 select HAVE_ARCH_KASAN if PPC_RADIX_MMU + select HAVE_ARCH_KASAN if PPC_BOOK3E_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN select HAVE_ARCH_KFENCE if PPC_BOOK3S_32 || PPC_8xx || 40x select HAVE_ARCH_KGDB @@ -254,6 +255,7 @@ config PPC select IOMMU_HELPER if PPC64 select IRQ_DOMAIN select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN && MODULES select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE select MODULES_USE_ELF_RELA diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 9f363c143d86..6a8855d45af7 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -375,4 +375,5 @@ config KASAN_SHADOW_OFFSET hex depends on KASAN default 0xe0000000 if PPC32 - default 0xa80e000000000000 if PPC64 + default 0xa80e000000000000 if PPC_BOOK3S_64 + default 0xa8001c0000000000 if PPC_BOOK3E_64 diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index a6be4025cba2..92a968202ba7 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -19,7 +19,7 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 -#ifdef CONFIG_MODULES +#if defined(CONFIG_MODULES) && defined(CONFIG_PPC32) #define KASAN_KERN_START ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M) #else #define KASAN_KERN_START PAGE_OFFSET @@ -39,6 +39,17 @@ * c00e000000000000 << 3 + a80e000000000000 = c00fc00000000000 */ #define KASAN_SHADOW_END 0xc00fc00000000000UL + +#else + +/* + * The shadow ends before the highest accessible address + * because we don't need a shadow for the shadow. + * But it doesn't hurt to have a shadow for the shadow, + * keep shadow end aligned eases things. + */ +#define KASAN_SHADOW_END 0xc000200000000000UL + #endif #ifdef CONFIG_KASAN diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index d3eea633d11a..cf2c08902c05 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -965,6 +965,9 @@ start_here_multiplatform: * and SLB setup before we turn on relocation. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif /* Restore parameters passed from prom_init/kexec */ mr r3,r31 LOAD_REG_ADDR(r12, DOTSYM(early_setup)) diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile index 4999aadb1867..699eeffd9f55 100644 --- a/arch/powerpc/mm/kasan/Makefile +++ b/arch/powerpc/mm/kasan/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_PPC32) += init_32.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += init_book3e_64.o diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c new file mode 100644 index 000000000000..11519e88dc6b --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3e powerpc + * + * Copyright 2022, Christophe Leroy, CS GROUP France + */ + +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include + +#include + +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); +} + +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); +} + +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); +} + +static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + if (kasan_pud_table(*p4dp)) { + pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); + } + pudp = pud_offset(p4dp, ea); + if (kasan_pmd_table(*pudp)) { + pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (kasan_pte_table(*pmdp)) { + ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0); + + return 0; +} + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_early_init(void) +{ + int i; + unsigned long addr; + pgd_t *pgd = pgd_offset_k(KASAN_SHADOW_START); + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + for (addr = KASAN_SHADOW_START; addr != KASAN_SHADOW_END; addr += PGDIR_SIZE) + p4d_populate(&init_mm, p4d_offset(pgd++, addr), kasan_early_shadow_pud); +} + +void __init kasan_init(void) +{ + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region((void *)start, (void *)end); + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); + + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + /* Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 0da5566d6b84..9300d641cf9a 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -99,4 +99,6 @@ void __init kasan_init(void) pr_info("KASAN init done\n"); } +void __init kasan_early_init(void) { } + void __init kasan_late_init(void) { } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9e2df4b66478..383ed4fe6013 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -2,7 +2,6 @@ config PPC32 bool default y if !PPC64 - select KASAN_VMALLOC if KASAN && MODULES config PPC64 bool "64-bit kernel" -- cgit v1.2.3 From c3fa64c99c61d99631a8e06a6cf991c35c98ec7d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 25 May 2022 10:05:52 -0300 Subject: KVM: PPC: Book3S HV: Decouple the debug timing from the P8 entry path We are currently doing the timing for debug purposes of the P9 entry path using the accumulators and terminology defined by the old entry path for P8 machines. Not only the "real-mode" and "napping" mentions are out of place for the P9 Radix entry path but also we cannot change them because the timing code is coupled to the structures defined in struct kvm_vcpu_arch. Add a new CONFIG_KVM_BOOK3S_HV_P9_TIMING to enable the timing code for the P9 entry path. For now, just add the new CONFIG and duplicate the structures. A subsequent patch will add the P9 changes. Signed-off-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220525130554.2614394-4-farosas@linux.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 8 ++++++++ arch/powerpc/kvm/Kconfig | 14 +++++++++++++- arch/powerpc/kvm/book3s_hv.c | 13 +++++++++++-- arch/powerpc/kvm/book3s_hv_p9_entry.c | 2 +- 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2909a88acd16..a0f44762a069 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -830,11 +830,19 @@ struct kvm_vcpu_arch { #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING struct kvmhv_tb_accumulator *cur_activity; /* What we're timing */ u64 cur_tb_start; /* when it started */ +#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ struct kvmhv_tb_accumulator guest_time; /* guest execution */ struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ +#else + struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ + struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ + struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ + struct kvmhv_tb_accumulator guest_time; /* guest execution */ + struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ +#endif #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ }; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 73f8277df7d1..191347f44731 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -130,10 +130,22 @@ config KVM_BOOK3S_64_PR config KVM_BOOK3S_HV_EXIT_TIMING bool +config KVM_BOOK3S_HV_P9_TIMING + bool "Detailed timing for the P9 entry point" + select KVM_BOOK3S_HV_EXIT_TIMING + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS + help + Calculate time taken for each vcpu in various parts of the + code. The total, minimum and maximum times in nanoseconds + together with the number of executions are reported in debugfs in + kvm/vm#/vcpu#/timings. + + If unsure, say N. + config KVM_BOOK3S_HV_P8_TIMING bool "Detailed timing for hypervisor real-mode code (for POWER8)" select KVM_BOOK3S_HV_EXIT_TIMING - depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS && !KVM_BOOK3S_HV_P9_TIMING help Calculate time taken for each vcpu in the real-mode guest entry, exit, and interrupt handling code, plus time spent in the guest diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e08fb3124dca..108ee563f72a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2660,11 +2660,19 @@ static struct debugfs_timings_element { const char *name; size_t offset; } timings[] = { +#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, +#else + {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, + {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, + {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, + {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, + {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, +#endif }; #define N_TIMINGS (ARRAY_SIZE(timings)) @@ -2783,8 +2791,9 @@ static const struct file_operations debugfs_timings_ops = { /* Create a debugfs directory for the vcpu */ static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { - debugfs_create_file("timings", 0444, debugfs_dentry, vcpu, - &debugfs_timings_ops); + if (cpu_has_feature(CPU_FTR_ARCH_300) == IS_ENABLED(CONFIG_KVM_BOOK3S_HV_P9_TIMING)) + debugfs_create_file("timings", 0444, debugfs_dentry, vcpu, + &debugfs_timings_ops); return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 7f88be386b27..8d8c0cf39bdf 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -437,7 +437,7 @@ void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(restore_p9_host_os_sprs); -#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next) { struct kvmppc_vcore *vc = vcpu->arch.vcore; -- cgit v1.2.3 From b44bb1b7cbbae66ec73868f5fbd57c54f0612d1c Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 25 May 2022 10:05:54 -0300 Subject: KVM: PPC: Book3S HV: Provide more detailed timings for P9 entry path Alter the data collection points for the debug timing code in the P9 path to be more in line with what the code does. The points where we accumulate time are now the following: vcpu_entry: From vcpu_run_hv entry until the start of the inner loop; guest_entry: From the start of the inner loop until the guest entry in asm; in_guest: From the guest entry in asm until the return to KVM C code; guest_exit: From the return into KVM C code until the corresponding hypercall/page fault handling or re-entry into the guest; hypercall: Time spent handling hcalls in the kernel (hcalls can go to QEMU, not accounted here); page_fault: Time spent handling page faults; vcpu_exit: vcpu_run_hv exit (almost no code here currently). Like before, these are exposed in debugfs in a file called "timings". There are four values: - number of occurrences of the accumulation point; - total time the vcpu spent in the phase in ns; - shortest time the vcpu spent in the phase in ns; - longest time the vcpu spent in the phase in ns; === Before: rm_entry: 53132 16793518 256 4060 rm_intr: 53132 2125914 22 340 rm_exit: 53132 24108344 374 2180 guest: 53132 40980507996 404 9997650 cede: 0 0 0 0 After: vcpu_entry: 34637 7716108 178 4416 guest_entry: 52414 49365608 324 747542 in_guest: 52411 40828715840 258 9997480 guest_exit: 52410 19681717182 826 102496674 vcpu_exit: 34636 1744462 38 182 hypercall: 45712 22878288 38 1307962 page_fault: 992 111104034 568 168688 With just one instruction (hcall): vcpu_entry: 1 942 942 942 guest_entry: 1 4044 4044 4044 in_guest: 1 1540 1540 1540 guest_exit: 1 3542 3542 3542 vcpu_exit: 1 80 80 80 hypercall: 0 0 0 0 page_fault: 0 0 0 0 === Signed-off-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220525130554.2614394-6-farosas@linux.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 12 +++++++----- arch/powerpc/kvm/Kconfig | 9 +++++---- arch/powerpc/kvm/book3s_hv.c | 23 ++++++++++++++++++----- arch/powerpc/kvm/book3s_hv_p9_entry.c | 14 ++++---------- 4 files changed, 34 insertions(+), 24 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a0f44762a069..eeba679802cf 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -831,11 +831,13 @@ struct kvm_vcpu_arch { struct kvmhv_tb_accumulator *cur_activity; /* What we're timing */ u64 cur_tb_start; /* when it started */ #ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING - struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ - struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ - struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ - struct kvmhv_tb_accumulator guest_time; /* guest execution */ - struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ + struct kvmhv_tb_accumulator vcpu_entry; + struct kvmhv_tb_accumulator vcpu_exit; + struct kvmhv_tb_accumulator in_guest; + struct kvmhv_tb_accumulator hcall; + struct kvmhv_tb_accumulator pg_fault; + struct kvmhv_tb_accumulator guest_entry; + struct kvmhv_tb_accumulator guest_exit; #else struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 191347f44731..cedf1e0f50e1 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -135,10 +135,11 @@ config KVM_BOOK3S_HV_P9_TIMING select KVM_BOOK3S_HV_EXIT_TIMING depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS help - Calculate time taken for each vcpu in various parts of the - code. The total, minimum and maximum times in nanoseconds - together with the number of executions are reported in debugfs in - kvm/vm#/vcpu#/timings. + Calculate time taken for each vcpu during vcpu entry and + exit, time spent inside the guest and time spent handling + hypercalls and page faults. The total, minimum and maximum + times in nanoseconds together with the number of executions + are reported in debugfs in kvm/vm#/vcpu#/timings. If unsure, say N. diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 108ee563f72a..c68883170a82 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2661,11 +2661,13 @@ static struct debugfs_timings_element { size_t offset; } timings[] = { #ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING - {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, - {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, - {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, - {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, - {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, + {"vcpu_entry", offsetof(struct kvm_vcpu, arch.vcpu_entry)}, + {"guest_entry", offsetof(struct kvm_vcpu, arch.guest_entry)}, + {"in_guest", offsetof(struct kvm_vcpu, arch.in_guest)}, + {"guest_exit", offsetof(struct kvm_vcpu, arch.guest_exit)}, + {"vcpu_exit", offsetof(struct kvm_vcpu, arch.vcpu_exit)}, + {"hypercall", offsetof(struct kvm_vcpu, arch.hcall)}, + {"page_fault", offsetof(struct kvm_vcpu, arch.pg_fault)}, #else {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, @@ -4014,8 +4016,10 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns mtspr(SPRN_DAR, vcpu->arch.shregs.dar); mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); switch_pmu_to_guest(vcpu, &host_os_sprs); + accumulate_time(vcpu, &vcpu->arch.in_guest); trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), __pa(&vcpu->arch.regs)); + accumulate_time(vcpu, &vcpu->arch.guest_exit); kvmhv_restore_hv_return_state(vcpu, &hvregs); switch_pmu_to_host(vcpu, &host_os_sprs); vcpu->arch.shregs.msr = vcpu->arch.regs.msr; @@ -4703,6 +4707,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) struct kvm *kvm; unsigned long msr; + start_timing(vcpu, &vcpu->arch.vcpu_entry); + if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL; @@ -4768,6 +4774,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { + accumulate_time(vcpu, &vcpu->arch.guest_entry); if (cpu_has_feature(CPU_FTR_ARCH_300)) r = kvmhv_run_single_vcpu(vcpu, ~(u64)0, vcpu->arch.vcore->lpcr); @@ -4775,6 +4782,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) r = kvmppc_run_vcpu(vcpu); if (run->exit_reason == KVM_EXIT_PAPR_HCALL) { + accumulate_time(vcpu, &vcpu->arch.hcall); + if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) { /* * These should have been caught reflected @@ -4790,6 +4799,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) trace_kvm_hcall_exit(vcpu, r); kvmppc_core_prepare_to_enter(vcpu); } else if (r == RESUME_PAGE_FAULT) { + accumulate_time(vcpu, &vcpu->arch.pg_fault); srcu_idx = srcu_read_lock(&kvm->srcu); r = kvmppc_book3s_hv_page_fault(vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); @@ -4801,12 +4811,15 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) r = kvmppc_xics_rm_complete(vcpu, 0); } } while (is_kvmppc_resume_guest(r)); + accumulate_time(vcpu, &vcpu->arch.vcpu_exit); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&kvm->arch.vcpus_running); srr_regs_clobbered(); + end_timing(vcpu); + return r; } diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 32e078db8da2..e740eca45862 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -779,8 +779,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV); WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME)); - start_timing(vcpu, &vcpu->arch.rm_entry); - vcpu->arch.ceded = 0; /* Save MSR for restore, with EE clear. */ @@ -941,13 +939,13 @@ tm_return_to_guest: mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); - accumulate_time(vcpu, &vcpu->arch.guest_time); - switch_pmu_to_guest(vcpu, &host_os_sprs); + accumulate_time(vcpu, &vcpu->arch.in_guest); + kvmppc_p9_enter_guest(vcpu); - switch_pmu_to_host(vcpu, &host_os_sprs); - accumulate_time(vcpu, &vcpu->arch.rm_intr); + accumulate_time(vcpu, &vcpu->arch.guest_exit); + switch_pmu_to_host(vcpu, &host_os_sprs); /* XXX: Could get these from r11/12 and paca exsave instead */ vcpu->arch.shregs.srr0 = mfspr(SPRN_SRR0); @@ -1042,8 +1040,6 @@ tm_return_to_guest: #endif } - accumulate_time(vcpu, &vcpu->arch.rm_exit); - /* Advance host PURR/SPURR by the amount used by guest */ purr = mfspr(SPRN_PURR); spurr = mfspr(SPRN_SPURR); @@ -1150,8 +1146,6 @@ tm_return_to_guest: asm volatile(PPC_CP_ABORT); out: - end_timing(vcpu); - return trap; } EXPORT_SYMBOL_GPL(kvmhv_vcpu_entry_p9); -- cgit v1.2.3 From 0df01238b8aa300cbc736e7ec433d201a76036f3 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 14 Jun 2022 13:52:04 -0300 Subject: KVM: PPC: Book3S HV: tracing: Add missing hcall names The kvm_trace_symbol_hcall macro is missing several of the hypercalls defined in hvcall.h. Add the most common ones that are issued during guest lifetime, including the ones that are only used by QEMU and SLOF. Signed-off-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220614165204.549229-1-farosas@linux.ibm.com --- arch/powerpc/include/asm/hvcall.h | 8 ++++++++ arch/powerpc/kvm/trace_hv.h | 21 ++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index d92a20a85395..1d454c70e7c6 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -350,6 +350,14 @@ /* Platform specific hcalls, used by KVM */ #define H_RTAS 0xf000 +/* + * Platform specific hcalls, used by QEMU/SLOF. These are ignored by + * KVM and only kept here so we can identify them during tracing. + */ +#define H_LOGICAL_MEMOP 0xF001 +#define H_CAS 0XF002 +#define H_UPDATE_DT 0XF003 + /* "Platform specific hcalls", provided by PHYP */ #define H_GET_24X7_CATALOG_PAGE 0xF078 #define H_GET_24X7_DATA 0xF07C diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index 32e2cb5811cc..8d57c8428531 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -94,6 +94,7 @@ {H_GET_HCA_INFO, "H_GET_HCA_INFO"}, \ {H_GET_PERF_COUNT, "H_GET_PERF_COUNT"}, \ {H_MANAGE_TRACE, "H_MANAGE_TRACE"}, \ + {H_GET_CPU_CHARACTERISTICS, "H_GET_CPU_CHARACTERISTICS"}, \ {H_FREE_LOGICAL_LAN_BUFFER, "H_FREE_LOGICAL_LAN_BUFFER"}, \ {H_QUERY_INT_STATE, "H_QUERY_INT_STATE"}, \ {H_POLL_PENDING, "H_POLL_PENDING"}, \ @@ -125,7 +126,25 @@ {H_COP, "H_COP"}, \ {H_GET_MPP_X, "H_GET_MPP_X"}, \ {H_SET_MODE, "H_SET_MODE"}, \ - {H_RTAS, "H_RTAS"} + {H_REGISTER_PROC_TBL, "H_REGISTER_PROC_TBL"}, \ + {H_QUERY_VAS_CAPABILITIES, "H_QUERY_VAS_CAPABILITIES"}, \ + {H_INT_GET_SOURCE_INFO, "H_INT_GET_SOURCE_INFO"}, \ + {H_INT_SET_SOURCE_CONFIG, "H_INT_SET_SOURCE_CONFIG"}, \ + {H_INT_GET_QUEUE_INFO, "H_INT_GET_QUEUE_INFO"}, \ + {H_INT_SET_QUEUE_CONFIG, "H_INT_SET_QUEUE_CONFIG"}, \ + {H_INT_ESB, "H_INT_ESB"}, \ + {H_INT_RESET, "H_INT_RESET"}, \ + {H_RPT_INVALIDATE, "H_RPT_INVALIDATE"}, \ + {H_RTAS, "H_RTAS"}, \ + {H_LOGICAL_MEMOP, "H_LOGICAL_MEMOP"}, \ + {H_CAS, "H_CAS"}, \ + {H_UPDATE_DT, "H_UPDATE_DT"}, \ + {H_GET_PERF_COUNTER_INFO, "H_GET_PERF_COUNTER_INFO"}, \ + {H_SET_PARTITION_TABLE, "H_SET_PARTITION_TABLE"}, \ + {H_ENTER_NESTED, "H_ENTER_NESTED"}, \ + {H_TLB_INVALIDATE, "H_TLB_INVALIDATE"}, \ + {H_COPY_TOFROM_GUEST, "H_COPY_TOFROM_GUEST"} + #define kvm_trace_symbol_kvmret \ {RESUME_GUEST, "RESUME_GUEST"}, \ -- cgit v1.2.3 From f5c847ea19d323974d6f7c7e9fa4858ce0727096 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 24 Jun 2022 11:27:12 -0300 Subject: KVM: PPC: Align pt_regs in kvm_vcpu_arch structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The H_ENTER_NESTED hypercall receives as second parameter the address of a region of memory containing the values for the nested guest privileged registers. We currently use the pt_regs structure contained within kvm_vcpu_arch for that end. Most hypercalls that receive a memory address expect that region to not cross a 4K page boundary. We would want H_ENTER_NESTED to follow the same pattern so this patch ensures the pt_regs structure sits within a page. Note: the pt_regs structure is currently 384 bytes in size, so aligning to 512 is sufficient to ensure it will not cross a 4K page and avoids punching too big a hole in struct kvm_vcpu_arch. Signed-off-by: Fabiano Rosas Signed-off-by: Murilo Opsfelder Araújo Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220624142712.790491-1-farosas@linux.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index eeba679802cf..c2b003550dc9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -523,7 +523,11 @@ struct kvm_vcpu_arch { struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; #endif - struct pt_regs regs; + /* + * This is passed along to the HV via H_ENTER_NESTED. Align to + * prevent it crossing a real 4K page. + */ + struct pt_regs regs __aligned(512); struct thread_fp_state fp; -- cgit v1.2.3 From 2a83afe72a2b5760155c2dd840c776aee292dc90 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 31 May 2022 16:59:36 +1000 Subject: powerpc/64: Drop ppc_inst_as_str() The ppc_inst_as_str() macro tries to make printing variable length, aka "prefixed", instructions convenient. It mostly succeeds, but it does hide an on-stack buffer, which triggers stack protector. More problematically it doesn't compile at all with GCC 12, with -Wdangling-pointer, due to the fact that it returns the char buffer declared inside the macro: arch/powerpc/kernel/trace/ftrace.c: In function '__ftrace_modify_call': ./include/linux/printk.h:475:44: error: using a dangling pointer to '__str' [-Werror=dangling-pointer=] 475 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) ... arch/powerpc/kernel/trace/ftrace.c:567:17: note: in expansion of macro 'pr_err' 567 | pr_err("Not expected bl: opcode is %s\n", ppc_inst_as_str(op)); | ^~~~~~ ./arch/powerpc/include/asm/inst.h:156:14: note: '__str' declared here 156 | char __str[PPC_INST_STR_LEN]; \ | ^~~~~ This could be fixed by having the caller declare the buffer, but in some places there'd need to be two buffers. In all cases where ppc_inst_as_str() is used the output is not really meant for user consumption, it's almost always indicative of a kernel bug. A simpler solution is to just print the value as an unsigned long. For normal instructions the output is identical. For prefixed instructions the value is printed as a single 64-bit quantity, whereas previously the low half was printed first. But that is good enough for debug output, especially as prefixed instructions will be rare in kernel code in practice. Old: c000000000111170 60420000 ori r2,r2,0 c000000000111174 04100001 e580fb00 .long 0xe580fb0004100001 New: c00000000010f90c 60420000 ori r2,r2,0 c00000000010f910 e580fb0004100001 .long 0xe580fb0004100001 Reported-by: Bagas Sanjaya Reported-by: Petr Mladek Signed-off-by: Michael Ellerman Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20220531065936.3674348-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/inst.h | 19 ------------------- arch/powerpc/kernel/kprobes.c | 2 +- arch/powerpc/kernel/trace/ftrace.c | 24 +++++++++++++----------- arch/powerpc/lib/test_emulate_step.c | 6 +++--- arch/powerpc/xmon/xmon.c | 2 +- 5 files changed, 18 insertions(+), 35 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index b49aae9f6f27..684d3f453282 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -139,25 +139,6 @@ static inline void ppc_inst_write(u32 *ptr, ppc_inst_t x) *(u64 *)ptr = ppc_inst_as_ulong(x); } -#define PPC_INST_STR_LEN sizeof("00000000 00000000") - -static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], ppc_inst_t x) -{ - if (ppc_inst_prefixed(x)) - sprintf(str, "%08x %08x", ppc_inst_val(x), ppc_inst_suffix(x)); - else - sprintf(str, "%08x", ppc_inst_val(x)); - - return str; -} - -#define ppc_inst_as_str(x) \ -({ \ - char __str[PPC_INST_STR_LEN]; \ - __ppc_inst_as_str(__str, x); \ - __str; \ -}) - static inline int __copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src) { unsigned int val, suffix; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 1c97c0f177ae..912d4f8a13be 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -269,7 +269,7 @@ static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) * So, we should never get here... but, its still * good to catch them, just in case... */ - printk("Can't step on instruction %s\n", ppc_inst_as_str(insn)); + printk("Can't step on instruction %08lx\n", ppc_inst_as_ulong(insn)); BUG(); } else { /* diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 2a893e06e4f1..cab67b5120b9 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -69,8 +69,8 @@ ftrace_modify_code(unsigned long ip, ppc_inst_t old, ppc_inst_t new) /* Make sure it is what we expect it to be */ if (!ppc_inst_equal(replaced, old)) { - pr_err("%p: replaced (%s) != old (%s)", - (void *)ip, ppc_inst_as_str(replaced), ppc_inst_as_str(old)); + pr_err("%p: replaced (%08lx) != old (%08lx)", (void *)ip, + ppc_inst_as_ulong(replaced), ppc_inst_as_ulong(old)); return -EINVAL; } @@ -127,7 +127,7 @@ __ftrace_make_nop(struct module *mod, /* Make sure that that this is still a 24bit jump */ if (!is_bl_op(op)) { - pr_err("Not expected bl: opcode is %s\n", ppc_inst_as_str(op)); + pr_err("Not expected bl: opcode is %08lx\n", ppc_inst_as_ulong(op)); return -EINVAL; } @@ -159,8 +159,8 @@ __ftrace_make_nop(struct module *mod, /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ if (!ppc_inst_equal(op, ppc_inst(PPC_RAW_MFLR(_R0))) && !ppc_inst_equal(op, ppc_inst(PPC_INST_STD_LR))) { - pr_err("Unexpected instruction %s around bl _mcount\n", - ppc_inst_as_str(op)); + pr_err("Unexpected instruction %08lx around bl _mcount\n", + ppc_inst_as_ulong(op)); return -EINVAL; } } else if (IS_ENABLED(CONFIG_PPC64)) { @@ -174,7 +174,8 @@ __ftrace_make_nop(struct module *mod, } if (!ppc_inst_equal(op, ppc_inst(PPC_INST_LD_TOC))) { - pr_err("Expected %08lx found %s\n", PPC_INST_LD_TOC, ppc_inst_as_str(op)); + pr_err("Expected %08lx found %08lx\n", PPC_INST_LD_TOC, + ppc_inst_as_ulong(op)); return -EINVAL; } } @@ -312,7 +313,7 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) /* Make sure that that this is still a 24bit jump */ if (!is_bl_op(op)) { - pr_err("Not expected bl: opcode is %s\n", ppc_inst_as_str(op)); + pr_err("Not expected bl: opcode is %08lx\n", ppc_inst_as_ulong(op)); return -EINVAL; } @@ -416,8 +417,8 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return -EFAULT; if (!expected_nop_sequence(ip, op[0], op[1])) { - pr_err("Unexpected call sequence at %p: %s %s\n", - ip, ppc_inst_as_str(op[0]), ppc_inst_as_str(op[1])); + pr_err("Unexpected call sequence at %p: %08lx %08lx\n", ip, + ppc_inst_as_ulong(op[0]), ppc_inst_as_ulong(op[1])); return -EINVAL; } @@ -486,7 +487,8 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) } if (!ppc_inst_equal(op, ppc_inst(PPC_RAW_NOP()))) { - pr_err("Unexpected call sequence at %p: %s\n", ip, ppc_inst_as_str(op)); + pr_err("Unexpected call sequence at %p: %08lx\n", + ip, ppc_inst_as_ulong(op)); return -EINVAL; } @@ -564,7 +566,7 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, /* Make sure that that this is still a 24bit jump */ if (!is_bl_op(op)) { - pr_err("Not expected bl: opcode is %s\n", ppc_inst_as_str(op)); + pr_err("Not expected bl: opcode is %08lx\n", ppc_inst_as_ulong(op)); return -EINVAL; } diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 4f141daafcff..f2e47be05e8c 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -1616,11 +1616,11 @@ static int __init emulate_compute_instr(struct pt_regs *regs, if (analysed != 1 || GETTYPE(op.type) != COMPUTE) { if (negative) return -EFAULT; - pr_info("emulation failed, instruction = %s\n", ppc_inst_as_str(instr)); + pr_info("emulation failed, instruction = %08lx\n", ppc_inst_as_ulong(instr)); return -EFAULT; } if (analysed == 1 && negative) - pr_info("negative test failed, instruction = %s\n", ppc_inst_as_str(instr)); + pr_info("negative test failed, instruction = %08lx\n", ppc_inst_as_ulong(instr)); if (!negative) emulate_update_regs(regs, &op); return 0; @@ -1637,7 +1637,7 @@ static int __init execute_compute_instr(struct pt_regs *regs, /* Patch the NOP with the actual instruction */ patch_instruction_site(&patch__exec_instr, instr); if (exec_instr(regs)) { - pr_info("execution failed, instruction = %s\n", ppc_inst_as_str(instr)); + pr_info("execution failed, instruction = %08lx\n", ppc_inst_as_ulong(instr)); return -EFAULT; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 3d9782ea3fa7..f80c714f1d49 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3047,7 +3047,7 @@ generic_inst_dump(unsigned long adr, long count, int praddr, dotted = 0; last_inst = inst; if (praddr) - printf(REG" %s", adr, ppc_inst_as_str(inst)); + printf(REG" %08lx", adr, ppc_inst_as_ulong(inst)); printf("\t"); if (!ppc_inst_prefixed(inst)) dump_func(ppc_inst_val(inst), adr); -- cgit v1.2.3 From b21bd5a4b130f8370861478d2880985daace5913 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Tue, 28 Jun 2022 00:41:19 +0530 Subject: powerpc/bpf: Fix use of user_pt_regs in uapi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to build a .c file that includes : $ cat test_bpf_headers.c #include throws the below error: /usr/include/linux/bpf_perf_event.h:14:28: error: field ‘regs’ has incomplete type 14 | bpf_user_pt_regs_t regs; | ^~~~ This is because we typedef bpf_user_pt_regs_t to 'struct user_pt_regs' in arch/powerpc/include/uaps/asm/bpf_perf_event.h, but 'struct user_pt_regs' is not exposed to userspace. Powerpc has both pt_regs and user_pt_regs structures. However, unlike arm64 and s390, we expose user_pt_regs to userspace as just 'pt_regs'. As such, we should typedef bpf_user_pt_regs_t to 'struct pt_regs' for userspace. Within the kernel though, we want to typedef bpf_user_pt_regs_t to 'struct user_pt_regs'. Remove arch/powerpc/include/uapi/asm/bpf_perf_event.h so that the uapi/asm-generic version of the header is exposed to userspace. Introduce arch/powerpc/include/asm/bpf_perf_event.h so that we can typedef bpf_user_pt_regs_t to 'struct user_pt_regs' for use within the kernel. Note that this was not showing up with the bpf selftest build since tools/include/uapi/asm/bpf_perf_event.h didn't include the powerpc variant. Fixes: a6460b03f945ee ("powerpc/bpf: Fix broken uapi for BPF_PROG_TYPE_PERF_EVENT") Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Naveen N. Rao [mpe: Use typical naming for header include guard] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220627191119.142867-1-naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/bpf_perf_event.h | 9 +++++++++ arch/powerpc/include/uapi/asm/bpf_perf_event.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 arch/powerpc/include/asm/bpf_perf_event.h delete mode 100644 arch/powerpc/include/uapi/asm/bpf_perf_event.h (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/bpf_perf_event.h b/arch/powerpc/include/asm/bpf_perf_event.h new file mode 100644 index 000000000000..e8a7b4ffb58c --- /dev/null +++ b/arch/powerpc/include/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BPF_PERF_EVENT_H +#define _ASM_POWERPC_BPF_PERF_EVENT_H + +#include + +typedef struct user_pt_regs bpf_user_pt_regs_t; + +#endif /* _ASM_POWERPC_BPF_PERF_EVENT_H */ diff --git a/arch/powerpc/include/uapi/asm/bpf_perf_event.h b/arch/powerpc/include/uapi/asm/bpf_perf_event.h deleted file mode 100644 index 5e1e648aeec4..000000000000 --- a/arch/powerpc/include/uapi/asm/bpf_perf_event.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ -#define _UAPI__ASM_BPF_PERF_EVENT_H__ - -#include - -typedef struct user_pt_regs bpf_user_pt_regs_t; - -#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */ -- cgit v1.2.3 From 24a9c54182b3758801b8ca6c8c237cc2ff654732 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:24 +0200 Subject: context_tracking: Split user tracking Kconfig Context tracking is going to be used not only to track user transitions but also idle/IRQs/NMIs. The user tracking part will then become a separate feature. Prepare Kconfig for that. [ frederic: Apply Max Filippov feedback. ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- .../time/context-tracking/arch-support.txt | 6 ++--- arch/Kconfig | 4 +-- arch/arm/Kconfig | 2 +- arch/arm/kernel/entry-common.S | 4 +-- arch/arm/kernel/entry-header.S | 4 +-- arch/arm64/Kconfig | 2 +- arch/csky/Kconfig | 2 +- arch/csky/kernel/entry.S | 4 +-- arch/loongarch/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/context_tracking.h | 2 +- arch/riscv/Kconfig | 2 +- arch/riscv/kernel/entry.S | 6 ++--- arch/sparc/Kconfig | 2 +- arch/sparc/kernel/rtrap_64.S | 2 +- arch/x86/Kconfig | 4 +-- arch/xtensa/Kconfig | 2 +- arch/xtensa/kernel/entry.S | 4 +-- include/linux/context_tracking.h | 12 ++++----- include/linux/context_tracking_state.h | 4 +-- init/Kconfig | 4 +-- kernel/context_tracking.c | 6 ++++- kernel/sched/core.c | 2 +- kernel/time/Kconfig | 31 ++++++++++++++-------- 25 files changed, 65 insertions(+), 52 deletions(-) (limited to 'arch/powerpc/include') diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt index c9e0a16290e6..e59071a49090 100644 --- a/Documentation/features/time/context-tracking/arch-support.txt +++ b/Documentation/features/time/context-tracking/arch-support.txt @@ -1,7 +1,7 @@ # -# Feature name: context-tracking -# Kconfig: HAVE_CONTEXT_TRACKING -# description: arch supports context tracking for NO_HZ_FULL +# Feature name: user-context-tracking +# Kconfig: HAVE_CONTEXT_TRACKING_USER +# description: arch supports user context tracking for NO_HZ_FULL # ----------------------- | arch |status| diff --git a/arch/Kconfig b/arch/Kconfig index fcf9a41a4ef5..154b7b78da09 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -774,7 +774,7 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES and similar) by implementing an inline arch_within_stack_frames(), which is used by CONFIG_HARDENED_USERCOPY. -config HAVE_CONTEXT_TRACKING +config HAVE_CONTEXT_TRACKING_USER bool help Provide kernel/user boundaries probes necessary for subsystems @@ -785,7 +785,7 @@ config HAVE_CONTEXT_TRACKING protected inside rcu_irq_enter/rcu_irq_exit() but preemption or signal handling on irq exit still need to be protected. -config HAVE_CONTEXT_TRACKING_OFFSTACK +config HAVE_CONTEXT_TRACKING_USER_OFFSTACK bool help Architecture neither relies on exception_enter()/exception_exit() diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 7630ba9cb6cc..9acc6aac5912 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -84,7 +84,7 @@ config ARM select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE select HAVE_ARM_SMCCC if CPU_V7 select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 7aa3ded4af92..37a0125fc926 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -28,7 +28,7 @@ #include "entry-header.S" saved_psr .req r8 -#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING) +#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING_USER) saved_pc .req r9 #define TRACE(x...) x #else @@ -38,7 +38,7 @@ saved_pc .req lr .section .entry.text,"ax",%progbits .align 5 -#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING) || \ +#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER) || \ IS_ENABLED(CONFIG_DEBUG_RSEQ)) /* * This is the fast syscall return path. We do as little as possible here, diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index 95def2b38d1c..99411fa91350 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -366,7 +366,7 @@ ALT_UP_B(.L1_\@) * between user and kernel mode. */ .macro ct_user_exit, save = 1 -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER .if \save stmdb sp!, {r0-r3, ip, lr} bl user_exit_callable @@ -378,7 +378,7 @@ ALT_UP_B(.L1_\@) .endm .macro ct_user_enter, save = 1 -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER .if \save stmdb sp!, {r0-r3, ip, lr} bl user_enter_callable diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1652a9800ebe..7c5dd2af9ca9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -174,7 +174,7 @@ config ARM64 select HAVE_C_RECORDMCOUNT select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 21d72b078eef..f55ba1745f7b 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -42,7 +42,7 @@ config CSKY select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_SECCOMP_FILTER - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_VIRT_CPU_ACCOUNTING_GEN select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index bc734d17c16f..547b4cd1b24b 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -19,7 +19,7 @@ .endm .macro context_tracking -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER mfcr a0, epsr btsti a0, 31 bt 1f @@ -159,7 +159,7 @@ ret_from_exception: and r10, r9 cmpnei r10, 0 bt exit_work -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER jbsr user_enter_callable #endif 1: diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 1920d52653b4..130dc65f3c85 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -76,7 +76,7 @@ config LOONGARCH select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ASM_MODVERSIONS - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_CONTIGUOUS diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index db09d45d59ec..9457894db237 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -56,7 +56,7 @@ config MIPS select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES select HAVE_ASM_MODVERSIONS - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_TIF_NOHZ select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c2ce2e60c8f0..874c8d81284a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -202,7 +202,7 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ASM_MODVERSIONS - select HAVE_CONTEXT_TRACKING if PPC64 + select HAVE_CONTEXT_TRACKING_USER if PPC64 select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/include/asm/context_tracking.h b/arch/powerpc/include/asm/context_tracking.h index f2682b28b050..4b63931c49e0 100644 --- a/arch/powerpc/include/asm/context_tracking.h +++ b/arch/powerpc/include/asm/context_tracking.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_CONTEXT_TRACKING_H #define _ASM_POWERPC_CONTEXT_TRACKING_H -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER #define SCHEDULE_USER bl schedule_user #else #define SCHEDULE_USER bl schedule diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 32ffef9f6e5b..29b46f217345 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -86,7 +86,7 @@ config RISCV select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_VMAP_STACK if MMU && 64BIT select HAVE_ASM_MODVERSIONS - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU select HAVE_EBPF_JIT if MMU diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 12f6bba57e33..b9eda3fcbd6d 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -111,7 +111,7 @@ _save_context: call __trace_hardirqs_off #endif -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER /* If previous state is in user mode, call user_exit_callable(). */ li a0, SR_PP and a0, s1, a0 @@ -176,7 +176,7 @@ handle_syscall: */ csrs CSR_STATUS, SR_IE #endif -#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING) +#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING_USER) /* Recover a0 - a7 for system calls */ REG_L a0, PT_A0(sp) REG_L a1, PT_A1(sp) @@ -269,7 +269,7 @@ resume_userspace: andi s1, s0, _TIF_WORK_MASK bnez s1, work_pending -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER call user_enter_callable #endif diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index ba449c47effd..9232411a8821 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -71,7 +71,7 @@ config SPARC64 select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_TIF_NOHZ select HAVE_DEBUG_KMEMLEAK select IOMMU_HELPER diff --git a/arch/sparc/kernel/rtrap_64.S b/arch/sparc/kernel/rtrap_64.S index c5fd4b450d9b..eef102765a7e 100644 --- a/arch/sparc/kernel/rtrap_64.S +++ b/arch/sparc/kernel/rtrap_64.S @@ -15,7 +15,7 @@ #include #include -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER # define SCHEDULE_USER schedule_user #else # define SCHEDULE_USER schedule diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be0b95e51df6..b0a6dbbb760b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -186,8 +186,8 @@ config X86 select HAVE_ASM_MODVERSIONS select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL - select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER if X86_64 + select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_BUILDTIME_MCOUNT_SORT diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 0b0f0172cced..7927fed7bc83 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -33,7 +33,7 @@ config XTENSA select HAVE_ARCH_KCSAN select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK - select HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_EXIT_THREAD diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index d72bcafae90c..fb67d85116e4 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -455,7 +455,7 @@ KABI_W or a3, a3, a2 abi_call trace_hardirqs_off 1: #endif -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER l32i abi_tmp0, a1, PT_PS bbci.l abi_tmp0, PS_UM_BIT, 1f abi_call user_exit_callable @@ -544,7 +544,7 @@ common_exception_return: j .Lrestore_state .Lexit_tif_loop_user: -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER abi_call user_enter_callable #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 63259fece7c7..e35ae66b4794 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -10,7 +10,7 @@ #include -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER extern void ct_cpu_track_user(int cpu); /* Called with interrupts disabled. */ @@ -52,7 +52,7 @@ static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; - if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) || + if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) || !context_tracking_enabled()) return 0; @@ -65,7 +65,7 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { - if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) && + if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) && context_tracking_enabled()) { if (prev_ctx != CONTEXT_KERNEL) ct_user_enter(prev_ctx); @@ -109,14 +109,14 @@ static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } static __always_inline bool context_tracking_guest_enter(void) { return false; } static inline void context_tracking_guest_exit(void) { } -#endif /* !CONFIG_CONTEXT_TRACKING */ +#endif /* !CONFIG_CONTEXT_TRACKING_USER */ #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) -#ifdef CONFIG_CONTEXT_TRACKING_FORCE +#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE extern void context_tracking_init(void); #else static inline void context_tracking_init(void) { } -#endif /* CONFIG_CONTEXT_TRACKING_FORCE */ +#endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE */ #endif diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index edc7b46376a6..2b46afe105a9 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -22,7 +22,7 @@ struct context_tracking { } state; }; -#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_CONTEXT_TRACKING_USER extern struct static_key_false context_tracking_key; DECLARE_PER_CPU(struct context_tracking, context_tracking); @@ -45,6 +45,6 @@ static inline bool context_tracking_enabled_this_cpu(void) static __always_inline bool context_tracking_enabled(void) { return false; } static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; } static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; } -#endif /* CONFIG_CONTEXT_TRACKING */ +#endif /* CONFIG_CONTEXT_TRACKING_USER */ #endif diff --git a/init/Kconfig b/init/Kconfig index c7900e8975f1..06454d19e2f0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -494,11 +494,11 @@ config VIRT_CPU_ACCOUNTING_NATIVE config VIRT_CPU_ACCOUNTING_GEN bool "Full dynticks CPU time accounting" - depends on HAVE_CONTEXT_TRACKING + depends on HAVE_CONTEXT_TRACKING_USER depends on HAVE_VIRT_CPU_ACCOUNTING_GEN depends on GENERIC_CLOCKEVENTS select VIRT_CPU_ACCOUNTING - select CONTEXT_TRACKING + select CONTEXT_TRACKING_USER help Select this option to enable task and CPU time accounting on full dynticks systems. This accounting is implemented by watching every diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index d361bd52e4e1..f3dec1be2bf6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -22,6 +22,8 @@ #include #include +#ifdef CONFIG_CONTEXT_TRACKING_USER + #define CREATE_TRACE_POINTS #include @@ -252,7 +254,7 @@ void __init ct_cpu_track_user(int cpu) initialized = true; } -#ifdef CONFIG_CONTEXT_TRACKING_FORCE +#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE void __init context_tracking_init(void) { int cpu; @@ -261,3 +263,5 @@ void __init context_tracking_init(void) ct_cpu_track_user(cpu); } #endif + +#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecd..883167a57bf9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6559,7 +6559,7 @@ void __sched schedule_idle(void) } while (need_resched()); } -#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) +#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 27b7868b5c30..41f99bcfe9e6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -73,6 +73,9 @@ config TIME_KUNIT_TEST If unsure, say N. +config CONTEXT_TRACKING + bool + if GENERIC_CLOCKEVENTS menu "Timers subsystem" @@ -111,7 +114,7 @@ config NO_HZ_FULL # NO_HZ_COMMON dependency # We need at least one periodic CPU for timekeeping depends on SMP - depends on HAVE_CONTEXT_TRACKING + depends on HAVE_CONTEXT_TRACKING_USER # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON @@ -137,31 +140,37 @@ config NO_HZ_FULL endchoice -config CONTEXT_TRACKING - bool +config CONTEXT_TRACKING_USER + bool + depends on HAVE_CONTEXT_TRACKING_USER + select CONTEXT_TRACKING + help + Track transitions between kernel and user on behalf of RCU and + tickless cputime accounting. The former case relies on context + tracking to enter/exit RCU extended quiescent states. -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING +config CONTEXT_TRACKING_USER_FORCE + bool "Force user context tracking" + depends on CONTEXT_TRACKING_USER default y if !NO_HZ_FULL help The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also + support the user context tracking subsystem. But there are also other dependencies to provide in order to make the full dynticks working. This option stands for testing when an arch implements the - context tracking backend but doesn't yet fulfill all the + user context tracking backend but doesn't yet fulfill all the requirements to make the full dynticks feature working. Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU + for user context tracking and the subsystems that rely on it: RCU userspace extended quiescent state and tickless cputime accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all + dynticks subsystem by forcing the user context tracking on all CPUs in the system. Say Y only if you're working on the development of an - architecture backend for the context tracking. + architecture backend for the user context tracking. Say N otherwise, this option brings an overhead that you don't want in production. -- cgit v1.2.3 From 65d9a9a60fd71be964effb2e94747a6acb6e7015 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 1 Jul 2022 13:04:04 +0530 Subject: kexec_file: drop weak attribute from functions As requested (http://lkml.kernel.org/r/87ee0q7b92.fsf@email.froward.int.ebiederm.org), this series converts weak functions in kexec to use the #ifdef approach. Quoting the 3e35142ef99fe ("kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add]") changelog: : Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols") : [1], binutils (v2.36+) started dropping section symbols that it thought : were unused. This isn't an issue in general, but with kexec_file.c, gcc : is placing kexec_arch_apply_relocations[_add] into a separate : .text.unlikely section and the section symbol ".text.unlikely" is being : dropped. Due to this, recordmcount is unable to find a non-weak symbol in : .text.unlikely to generate a relocation record against. This patch (of 2); Drop __weak attribute from functions in kexec_file.c: - arch_kexec_kernel_image_probe() - arch_kimage_file_post_load_cleanup() - arch_kexec_kernel_image_load() - arch_kexec_locate_mem_hole() - arch_kexec_kernel_verify_sig() arch_kexec_kernel_image_load() calls into kexec_image_load_default(), so drop the static attribute for the latter. arch_kexec_kernel_verify_sig() is not overridden by any architecture, so drop the __weak attribute. Link: https://lkml.kernel.org/r/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/2cd7ca1fe4d6bb6ca38e3283c717878388ed6788.1656659357.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Naveen N. Rao Suggested-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Mimi Zohar --- arch/arm64/include/asm/kexec.h | 4 +++- arch/powerpc/include/asm/kexec.h | 9 ++++++++ arch/s390/include/asm/kexec.h | 3 +++ arch/x86/include/asm/kexec.h | 6 ++++++ include/linux/kexec.h | 44 ++++++++++++++++++++++++++++++++++------ kernel/kexec_file.c | 35 ++------------------------------ 6 files changed, 61 insertions(+), 40 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 9839bfc163d7..78d272b26ebd 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -115,7 +115,9 @@ extern const struct kexec_file_ops kexec_image_ops; struct kimage; -extern int arch_kimage_file_post_load_cleanup(struct kimage *image); +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup + extern int load_other_segments(struct kimage *image, unsigned long kernel_load_addr, unsigned long kernel_size, char *initrd, unsigned long initrd_len, diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 2aefe14e1442..1e5e9b6ec78d 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -120,6 +120,15 @@ int setup_purgatory(struct kimage *image, const void *slave_code, #ifdef CONFIG_PPC64 struct kexec_buf; +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len); +#define arch_kexec_kernel_image_probe arch_kexec_kernel_image_probe + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup + +int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf); +#define arch_kexec_locate_mem_hole arch_kexec_locate_mem_hole + int load_crashdump_segments_ppc64(struct kimage *image, struct kexec_buf *kbuf); int setup_purgatory_ppc64(struct kimage *image, const void *slave_code, diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 649ecdcc8734..8886aadc11a3 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -92,5 +92,8 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif #endif /*_S390_KEXEC_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 6ad8d946cd3e..5ec359c1b50c 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -193,6 +193,12 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +void *arch_kexec_kernel_image_load(struct kimage *image); +#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif #endif diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 475683cd67f1..6958c6b471f4 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -188,21 +188,53 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +void *kexec_image_load_default(struct kimage *image); + +#ifndef arch_kexec_kernel_image_probe +static inline int +arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) +{ + return kexec_image_probe_default(image, buf, buf_len); +} +#endif + +#ifndef arch_kimage_file_post_load_cleanup +static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + return kexec_image_post_load_cleanup_default(image); +} +#endif + +#ifndef arch_kexec_kernel_image_load +static inline void *arch_kexec_kernel_image_load(struct kimage *image) +{ + return kexec_image_load_default(image); +} +#endif -/* Architectures may override the below functions */ -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len); -void *arch_kexec_kernel_image_load(struct kimage *image); -int arch_kimage_file_post_load_cleanup(struct kimage *image); #ifdef CONFIG_KEXEC_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len); #endif -int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf); extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); +#ifndef arch_kexec_locate_mem_hole +/** + * arch_kexec_locate_mem_hole - Find free memory to place the segments. + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. + */ +static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + return kexec_locate_mem_hole(kbuf); +} +#endif + /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f9261c07b048..0c27c81351ee 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -62,14 +62,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, return ret; } -/* Architectures can provide this probe function */ -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return kexec_image_probe_default(image, buf, buf_len); -} - -static void *kexec_image_load_default(struct kimage *image) +void *kexec_image_load_default(struct kimage *image) { if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -80,11 +73,6 @@ static void *kexec_image_load_default(struct kimage *image) image->cmdline_buf_len); } -void * __weak arch_kexec_kernel_image_load(struct kimage *image) -{ - return kexec_image_load_default(image); -} - int kexec_image_post_load_cleanup_default(struct kimage *image) { if (!image->fops || !image->fops->cleanup) @@ -93,11 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } -int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - return kexec_image_post_load_cleanup_default(image); -} - #ifdef CONFIG_KEXEC_SIG static int kexec_image_verify_sig_default(struct kimage *image, void *buf, unsigned long buf_len) @@ -110,8 +93,7 @@ static int kexec_image_verify_sig_default(struct kimage *image, void *buf, return image->fops->verify_sig(buf, buf_len); } -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len) +int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return kexec_image_verify_sig_default(image, buf, buf_len); } @@ -621,19 +603,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) return ret == 1 ? 0 : -EADDRNOTAVAIL; } -/** - * arch_kexec_locate_mem_hole - Find free memory to place the segments. - * @kbuf: Parameters for the memory search. - * - * On success, kbuf->mem will have the start address of the memory region found. - * - * Return: 0 on success, negative errno on error. - */ -int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) -{ - return kexec_locate_mem_hole(kbuf); -} - /** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. -- cgit v1.2.3 From 0738eceb6201691534df07e0928d0a6168a35787 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 1 Jul 2022 13:04:05 +0530 Subject: kexec: drop weak attribute from functions Drop __weak attribute from functions in kexec_core.c: - machine_kexec_post_load() - arch_kexec_protect_crashkres() - arch_kexec_unprotect_crashkres() - crash_free_reserved_phys_range() Link: https://lkml.kernel.org/r/c0f6219e03cb399d166d518ab505095218a902dd.1656659357.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Naveen N. Rao Suggested-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Mimi Zohar --- arch/arm64/include/asm/kexec.h | 16 ++++++++++++++-- arch/powerpc/include/asm/kexec.h | 5 +++++ arch/s390/include/asm/kexec.h | 11 +++++++++++ arch/x86/include/asm/kexec.h | 6 ++++++ include/linux/kexec.h | 32 ++++++++++++++++++++++++++++---- kernel/kexec_core.c | 27 --------------------------- 6 files changed, 64 insertions(+), 33 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 78d272b26ebd..559bfae26715 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -84,16 +84,30 @@ static inline void crash_setup_regs(struct pt_regs *newregs, extern bool crash_is_nosave(unsigned long pfn); extern void crash_prepare_suspend(void); extern void crash_post_resume(void); + +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); +#define crash_free_reserved_phys_range crash_free_reserved_phys_range #else static inline bool crash_is_nosave(unsigned long pfn) {return false; } static inline void crash_prepare_suspend(void) {} static inline void crash_post_resume(void) {} #endif +struct kimage; + #if defined(CONFIG_KEXEC_CORE) void cpu_soft_restart(unsigned long el2_switch, unsigned long entry, unsigned long arg0, unsigned long arg1, unsigned long arg2); + +int machine_kexec_post_load(struct kimage *image); +#define machine_kexec_post_load machine_kexec_post_load + +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres #endif #define ARCH_HAS_KIMAGE_ARCH @@ -113,8 +127,6 @@ struct kimage_arch { #ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops kexec_image_ops; -struct kimage; - int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 1e5e9b6ec78d..d6f4edfe4737 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -98,6 +98,11 @@ void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_co void kexec_copy_flush(struct kimage *image); +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_PPC_RTAS) +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); +#define crash_free_reserved_phys_range crash_free_reserved_phys_range +#endif + #ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops kexec_elf64_ops; diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 8886aadc11a3..1bd08eb56d5f 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -85,6 +85,17 @@ struct kimage_arch { extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; +#ifdef CONFIG_CRASH_DUMP +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); +#define crash_free_reserved_phys_range crash_free_reserved_phys_range + +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres +#endif + #ifdef CONFIG_KEXEC_FILE struct purgatory_info; int arch_kexec_apply_relocations_add(struct purgatory_info *pi, diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5ec359c1b50c..a3760ca796aa 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -186,6 +186,12 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres + #ifdef CONFIG_KEXEC_FILE struct purgatory_info; int arch_kexec_apply_relocations_add(struct purgatory_info *pi, diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6958c6b471f4..8107606ad1e8 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -390,7 +390,10 @@ extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); -int machine_kexec_post_load(struct kimage *image); + +#ifndef machine_kexec_post_load +static inline int machine_kexec_post_load(struct kimage *image) { return 0; } +#endif extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); @@ -423,10 +426,21 @@ extern bool kexec_in_progress; int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); -void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); -void arch_kexec_protect_crashkres(void); -void arch_kexec_unprotect_crashkres(void); +#ifndef arch_kexec_protect_crashkres +/* + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +static inline void arch_kexec_protect_crashkres(void) { } +#endif + +#ifndef arch_kexec_unprotect_crashkres +static inline void arch_kexec_unprotect_crashkres(void) { } +#endif #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) @@ -456,6 +470,16 @@ static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys) } #endif +#ifndef crash_free_reserved_phys_range +static inline void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); +} +#endif + static inline unsigned long virt_to_boot_phys(void *addr) { return phys_to_boot_phys(__pa((unsigned long)addr)); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 4d34c78334ce..acd029b307e4 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -591,11 +591,6 @@ static void kimage_free_extra_pages(struct kimage *image) } -int __weak machine_kexec_post_load(struct kimage *image) -{ - return 0; -} - void kimage_terminate(struct kimage *image) { if (*image->entry != 0) @@ -1020,15 +1015,6 @@ size_t crash_get_memory_size(void) return size; } -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); -} - int crash_shrink_memory(unsigned long new_size) { int ret = 0; @@ -1225,16 +1211,3 @@ int kernel_kexec(void) mutex_unlock(&kexec_mutex); return error; } - -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_kexec_protect_crashkres(void) -{} - -void __weak arch_kexec_unprotect_crashkres(void) -{} -- cgit v1.2.3 From 6eac1eaf2105dd8a2daf3b47634c24fc956bc77a Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:37 +0530 Subject: powerpc/mm: move protection_map[] inside the platform This moves protection_map[] inside the platform and while here, also enable ARCH_HAS_VM_GET_PAGE_PROT on 32 bit and nohash 64 (aka book3e/64) platforms via DECLARE_VM_GET_PAGE_PROT. Link: https://lkml.kernel.org/r/20220711070600.2378316-4-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Cc: Michael Ellerman Cc: Paul Mackerras Cc: Nicholas Piggin Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/pgtable.h | 20 +------------------- arch/powerpc/mm/pgtable.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 20 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c2ce2e60c8f0..1035d172c7dd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -140,7 +140,7 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT if PPC_BOOK3S_64 + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index d564d0ecd4cd..33f4bf8d22b0 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -20,25 +20,6 @@ struct mm_struct; #include #endif /* !CONFIG_PPC_BOOK3S */ -/* Note due to the way vm flags are laid out, the bits are XWR */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - #ifndef __ASSEMBLY__ #ifndef MAX_PTRS_PER_PGD @@ -79,6 +60,7 @@ extern void paging_init(void); void poking_init(void); extern unsigned long ioremap_bot; +extern const pgprot_t protection_map[16]; /* * kern_addr_valid is intended to indicate whether an address is a valid diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index e6166b71d36d..cb2dcdb18f8e 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -472,3 +472,27 @@ out: return ret_pte; } EXPORT_SYMBOL_GPL(__find_linux_pte); + +/* Note due to the way vm flags are laid out, the bits are XWR */ +const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_X, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY_X, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; + +#ifndef CONFIG_PPC_BOOK3S_64 +DECLARE_VM_GET_PAGE_PROT +#endif -- cgit v1.2.3 From 9592eef7c16ec5fb9f36c4d9abe8eeffc2e1d2f3 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 5 Jul 2022 20:48:41 +0200 Subject: random: remove CONFIG_ARCH_RANDOM When RDRAND was introduced, there was much discussion on whether it should be trusted and how the kernel should handle that. Initially, two mechanisms cropped up, CONFIG_ARCH_RANDOM, a compile time switch, and "nordrand", a boot-time switch. Later the thinking evolved. With a properly designed RNG, using RDRAND values alone won't harm anything, even if the outputs are malicious. Rather, the issue is whether those values are being *trusted* to be good or not. And so a new set of options were introduced as the real ones that people use -- CONFIG_RANDOM_TRUST_CPU and "random.trust_cpu". With these options, RDRAND is used, but it's not always credited. So in the worst case, it does nothing, and in the best case, maybe it helps. Along the way, CONFIG_ARCH_RANDOM's meaning got sort of pulled into the center and became something certain platforms force-select. The old options don't really help with much, and it's a bit odd to have special handling for these instructions when the kernel can deal fine with the existence or untrusted existence or broken existence or non-existence of that CPU capability. Simplify the situation by removing CONFIG_ARCH_RANDOM and using the ordinary asm-generic fallback pattern instead, keeping the two options that are actually used. For now it leaves "nordrand" for now, as the removal of that will take a different route. Acked-by: Michael Ellerman Acked-by: Catalin Marinas Acked-by: Borislav Petkov Acked-by: Heiko Carstens Acked-by: Greg Kroah-Hartman Signed-off-by: Jason A. Donenfeld --- arch/arm/include/asm/archrandom.h | 2 ++ arch/arm64/Kconfig | 8 ------- arch/arm64/include/asm/archrandom.h | 10 --------- arch/arm64/kernel/cpufeature.c | 2 -- arch/powerpc/Kconfig | 3 --- arch/powerpc/include/asm/archrandom.h | 3 --- arch/powerpc/include/asm/machdep.h | 2 -- arch/powerpc/platforms/microwatt/Kconfig | 1 - arch/powerpc/platforms/powernv/Kconfig | 1 - arch/powerpc/platforms/pseries/Kconfig | 1 - arch/s390/Kconfig | 15 ------------- arch/s390/configs/zfcpdump_defconfig | 1 - arch/s390/crypto/Makefile | 2 +- arch/s390/include/asm/archrandom.h | 3 --- arch/s390/kernel/setup.c | 2 -- arch/x86/Kconfig | 9 -------- arch/x86/include/asm/archrandom.h | 14 ++++-------- arch/x86/kernel/cpu/rdrand.c | 2 -- drivers/char/Kconfig | 1 - drivers/char/hw_random/s390-trng.c | 9 -------- include/asm-generic/Kbuild | 1 + include/asm-generic/archrandom.h | 25 ++++++++++++++++++++++ include/linux/random.h | 9 +------- .../testing/selftests/wireguard/qemu/kernel.config | 1 - 24 files changed, 34 insertions(+), 93 deletions(-) create mode 100644 include/asm-generic/archrandom.h (limited to 'arch/powerpc/include') diff --git a/arch/arm/include/asm/archrandom.h b/arch/arm/include/asm/archrandom.h index a8e84ca5c2ee..cc4714eb1a75 100644 --- a/arch/arm/include/asm/archrandom.h +++ b/arch/arm/include/asm/archrandom.h @@ -7,4 +7,6 @@ static inline bool __init smccc_probe_trng(void) return false; } +#include + #endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1652a9800ebe..1880f71c2547 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1858,14 +1858,6 @@ config ARM64_E0PD This option enables E0PD for TTBR1 where available. -config ARCH_RANDOM - bool "Enable support for random number generation" - default y - help - Random number generation (part of the ARMv8.5 Extensions) - provides a high bandwidth, cryptographically secure - hardware random number generator. - config ARM64_AS_HAS_MTE # Initial support for MTE went in binutils 2.32.0, checked with # ".arch armv8.5-a+memtag" below. However, this was incomplete diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index 3a6b6d38c5b8..c3b9fa56af67 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -2,8 +2,6 @@ #ifndef _ASM_ARCHRANDOM_H #define _ASM_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include #include #include @@ -167,12 +165,4 @@ arch_get_random_seed_long_early(unsigned long *v) } #define arch_get_random_seed_long_early arch_get_random_seed_long_early -#else /* !CONFIG_ARCH_RANDOM */ - -static inline bool __init smccc_probe_trng(void) -{ - return false; -} - -#endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d88433de81d..0e9462abeb77 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2416,7 +2416,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_e0pd, }, #endif -#ifdef CONFIG_ARCH_RANDOM { .desc = "Random Number Generator", .capability = ARM64_HAS_RNG, @@ -2428,7 +2427,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sign = FTR_UNSIGNED, .min_field_value = 1, }, -#endif #ifdef CONFIG_ARM64_BTI { .desc = "Branch Target Identification", diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7aa12e88c580..623deb5bedf6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1252,9 +1252,6 @@ config PHYSICAL_START default "0x00000000" endif -config ARCH_RANDOM - def_bool n - config PPC_LIB_RHEAP bool diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 9a53e29680f4..25ba65df6b1a 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -2,8 +2,6 @@ #ifndef _ASM_POWERPC_ARCHRANDOM_H #define _ASM_POWERPC_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include static inline bool __must_check arch_get_random_long(unsigned long *v) @@ -35,7 +33,6 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) return rc; } -#endif /* CONFIG_ARCH_RANDOM */ #ifdef CONFIG_PPC_POWERNV int powernv_hwrng_present(void); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 358d171ae8e0..6c1002043367 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -200,9 +200,7 @@ struct machdep_calls { ssize_t (*cpu_release)(const char *, size_t); #endif -#ifdef CONFIG_ARCH_RANDOM int (*get_random_seed)(unsigned long *v); -#endif }; extern void e500_idle(void); diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig index 5e320f49583a..6af443a1db99 100644 --- a/arch/powerpc/platforms/microwatt/Kconfig +++ b/arch/powerpc/platforms/microwatt/Kconfig @@ -6,7 +6,6 @@ config PPC_MICROWATT select PPC_ICS_NATIVE select PPC_ICP_NATIVE select PPC_UDBG_16550 - select ARCH_RANDOM help This option enables support for FPGA-based Microwatt implementations. diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 161dfe024085..e1a05c5a9004 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -12,7 +12,6 @@ config PPC_POWERNV select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 - select ARCH_RANDOM select CPU_FREQ select PPC_DOORBELL select MMU_NOTIFIER diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f7fd91d153a4..f4a647c1f0b2 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -19,7 +19,6 @@ config PPC_PSERIES select PPC_UDBG_16550 select PPC_DOORBELL select HOTPLUG_CPU - select ARCH_RANDOM select FORCE_SMP select SWIOTLB default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8cd9e56c629b..9b6e4e7cb17b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -507,21 +507,6 @@ config KEXEC_SIG verification for the corresponding kernel image type being loaded in order for this to work. -config ARCH_RANDOM - def_bool y - prompt "s390 architectural random number generation API" - help - Enable the s390 architectural random number generation API - to provide random data for all consumers within the Linux - kernel. - - When enabled the arch_random_* functions declared in linux/random.h - are implemented. The implementation is based on the s390 CPACF - instruction subfunction TRNG which provides a real true random - number generator. - - If unsure, say Y. - config KERNEL_NOBP def_bool n prompt "Enable modified branch prediction for the kernel by default" diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index a87fcc45e307..f4976f611b94 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -15,7 +15,6 @@ CONFIG_TUNE_ZEC12=y # CONFIG_COMPAT is not set CONFIG_NR_CPUS=2 CONFIG_HZ_100=y -# CONFIG_ARCH_RANDOM is not set # CONFIG_RELOCATABLE is not set # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index c63abfeb6d17..1b1cc478fa94 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o -obj-$(CONFIG_ARCH_RANDOM) += arch_random.o +obj-y += arch_random.o crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index 2c6e1c6ecbe7..0a1c2e66c709 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -11,8 +11,6 @@ #ifndef _ASM_S390_ARCHRANDOM_H #define _ASM_S390_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include #include #include @@ -50,5 +48,4 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) return false; } -#endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 0a37f5de2863..ebad41afe355 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -876,10 +876,8 @@ static void __init setup_randomness(void) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); -#ifdef CONFIG_ARCH_RANDOM if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) static_branch_enable(&s390_arch_random_available); -#endif } /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e58798f636d4..ba13749c09c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1810,15 +1810,6 @@ config ARCH_USES_PG_UNCACHED def_bool y depends on X86_PAT -config ARCH_RANDOM - def_bool y - prompt "x86 architectural random number generator" if EXPERT - help - Enable the x86 architectural RDRAND instruction - (Intel Bull Mountain technology) to generate random numbers. - If supported, this is a high bandwidth, cryptographically - secure hardware random number generator. - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index ebc248e49549..fb235b696175 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -65,10 +65,8 @@ static inline bool __must_check rdseed_int(unsigned int *v) /* * These are the generic interfaces; they must not be declared if the - * stubs in are to be invoked, - * i.e. CONFIG_ARCH_RANDOM is not defined. + * stubs in are to be invoked. */ -#ifdef CONFIG_ARCH_RANDOM static inline bool __must_check arch_get_random_long(unsigned long *v) { @@ -90,12 +88,8 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false; } -extern void x86_init_rdrand(struct cpuinfo_x86 *c); - -#else /* !CONFIG_ARCH_RANDOM */ - -static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { } - -#endif /* !CONFIG_ARCH_RANDOM */ +#ifndef CONFIG_UML +void x86_init_rdrand(struct cpuinfo_x86 *c); +#endif #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index c4be62058dd9..8f216669ecb8 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -26,7 +26,6 @@ __setup("nordrand", x86_rdrand_setup); */ #define SANITY_CHECK_LOOPS 8 -#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { unsigned int changed = 0; @@ -63,4 +62,3 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) "RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); } -#endif diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 0b6c03643ddc..30192e123e5f 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -431,7 +431,6 @@ config ADI config RANDOM_TRUST_CPU bool "Initialize RNG using CPU RNG instructions" default y - depends on ARCH_RANDOM help Initialize the RNG using random numbers supplied by the CPU's RNG instructions (e.g. RDRAND), if supported and available. These diff --git a/drivers/char/hw_random/s390-trng.c b/drivers/char/hw_random/s390-trng.c index 2beaa35c0d74..488808dc17a2 100644 --- a/drivers/char/hw_random/s390-trng.c +++ b/drivers/char/hw_random/s390-trng.c @@ -108,7 +108,6 @@ static ssize_t trng_counter_show(struct device *dev, { u64 dev_counter = atomic64_read(&trng_dev_counter); u64 hwrng_counter = atomic64_read(&trng_hwrng_counter); -#if IS_ENABLED(CONFIG_ARCH_RANDOM) u64 arch_counter = atomic64_read(&s390_arch_random_counter); return sysfs_emit(buf, @@ -118,14 +117,6 @@ static ssize_t trng_counter_show(struct device *dev, "total: %llu\n", dev_counter, hwrng_counter, arch_counter, dev_counter + hwrng_counter + arch_counter); -#else - return sysfs_emit(buf, - "trng: %llu\n" - "hwrng: %llu\n" - "total: %llu\n", - dev_counter, hwrng_counter, - dev_counter + hwrng_counter); -#endif } static DEVICE_ATTR(byte_counter, 0444, trng_counter_show, NULL); diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 8e47d483b524..36db8b9eb68a 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -5,6 +5,7 @@ # asm headers from the host architecutre.) mandatory-y += atomic.h +mandatory-y += archrandom.h mandatory-y += barrier.h mandatory-y += bitops.h mandatory-y += bug.h diff --git a/include/asm-generic/archrandom.h b/include/asm-generic/archrandom.h new file mode 100644 index 000000000000..3a5ee202dd86 --- /dev/null +++ b/include/asm-generic/archrandom.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_GENERIC_ARCHRANDOM_H__ +#define __ASM_GENERIC_ARCHRANDOM_H__ + +static inline bool __must_check arch_get_random_long(unsigned long *v) +{ + return false; +} + +static inline bool __must_check arch_get_random_int(unsigned int *v) +{ + return false; +} + +static inline bool __must_check arch_get_random_seed_long(unsigned long *v) +{ + return false; +} + +static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +{ + return false; +} + +#endif diff --git a/include/linux/random.h b/include/linux/random.h index 20e389a14e5c..865770e29f3e 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -106,14 +106,7 @@ declare_get_random_var_wait(long, unsigned long) */ #include -#ifdef CONFIG_ARCH_RANDOM -# include -#else -static inline bool __must_check arch_get_random_long(unsigned long *v) { return false; } -static inline bool __must_check arch_get_random_int(unsigned int *v) { return false; } -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) { return false; } -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { return false; } -#endif +#include /* * Called from the boot CPU during startup; not valid to call once diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index bad88f4b0a03..e1858ce7003f 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -58,7 +58,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_HZ_PERIODIC=n CONFIG_HIGH_RES_TIMERS=y -CONFIG_ARCH_RANDOM=y CONFIG_FILE_LOCKING=y CONFIG_POSIX_TIMERS=y CONFIG_DEVTMPFS=y -- cgit v1.2.3 From c6b2bd262b33aa2451f52aec2190131d1762945a Mon Sep 17 00:00:00 2001 From: Scott Cheloha Date: Wed, 13 Jul 2022 15:23:32 -0500 Subject: powerpc/pseries: hvcall.h: add H_WATCHDOG opcode, H_NOOP return code PAPR v2.12 defines a new hypercall, H_WATCHDOG. The hypercall permits guest control of one or more virtual watchdog timers. Add the opcode for the H_WATCHDOG hypercall to hvcall.h. While here, add a definition for H_NOOP, a possible return code for H_WATCHDOG. Signed-off-by: Scott Cheloha Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220713202335.1217647-2-cheloha@linux.ibm.com --- arch/powerpc/include/asm/hvcall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 1d454c70e7c6..4457abe31e6e 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -87,6 +87,7 @@ #define H_P7 -60 #define H_P8 -61 #define H_P9 -62 +#define H_NOOP -63 #define H_TOO_BIG -64 #define H_UNSUPPORTED -67 #define H_OVERLAP -68 @@ -324,7 +325,8 @@ #define H_RPT_INVALIDATE 0x448 #define H_SCM_FLUSH 0x44C #define H_GET_ENERGY_SCALE_INFO 0x450 -#define MAX_HCALL_OPCODE H_GET_ENERGY_SCALE_INFO +#define H_WATCHDOG 0x45C +#define MAX_HCALL_OPCODE H_WATCHDOG /* Scope args for H_SCM_UNBIND_ALL */ #define H_UNBIND_SCOPE_ALL (0x1) -- cgit v1.2.3 From 1621563ec62ff143c7b817dd5eab0884cdfaf89d Mon Sep 17 00:00:00 2001 From: Scott Cheloha Date: Wed, 13 Jul 2022 15:23:33 -0500 Subject: powerpc/pseries: add FW_FEATURE_WATCHDOG flag PAPR v2.12 specifies a new optional function set, "hcall-watchdog", for the /rtas/ibm,hypertas-functions property. The presence of this function set indicates support for the H_WATCHDOG hypercall. Check for this function set and, if present, set the new FW_FEATURE_WATCHDOG flag. Signed-off-by: Scott Cheloha Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220713202335.1217647-3-cheloha@linux.ibm.com --- arch/powerpc/include/asm/firmware.h | 3 ++- arch/powerpc/platforms/pseries/firmware.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 8dddd34b8ecf..398e0b5e485f 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -55,6 +55,7 @@ #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000) #define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000) #define FW_FEATURE_ENERGY_SCALE_INFO ASM_CONST(0x0000040000000000) +#define FW_FEATURE_WATCHDOG ASM_CONST(0x0000080000000000) #ifndef __ASSEMBLY__ @@ -76,7 +77,7 @@ enum { FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY | - FW_FEATURE_ENERGY_SCALE_INFO, + FW_FEATURE_ENERGY_SCALE_INFO | FW_FEATURE_WATCHDOG, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 09c119b2f623..080108d129ed 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -67,6 +67,7 @@ hypertas_fw_features_table[] = { {FW_FEATURE_PAPR_SCM, "hcall-scm"}, {FW_FEATURE_RPT_INVALIDATE, "hcall-rpt-invalidate"}, {FW_FEATURE_ENERGY_SCALE_INFO, "hcall-energy-scale-info"}, + {FW_FEATURE_WATCHDOG, "hcall-watchdog"}, }; /* Build up the firmware features bitmask using the contents of -- cgit v1.2.3 From 4c9da83011c455c0791b1f5e4e84d454d4f4ae3c Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Mon, 11 Jul 2022 19:36:16 -0300 Subject: KVM: PPC: Book3S HV: Remove kvmhv_p9_[set,restore]_lpcr declarations The commit b1b1697ae0cc ("KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support") removed the last references to these functions. Fixes: b1b1697ae0cc ("KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support") Signed-off-by: Murilo Opsfelder Araujo Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220711223617.63625-2-muriloo@linux.ibm.com --- arch/powerpc/include/asm/kvm_book3s.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 91c9f937edcd..ff1336ab4c47 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -281,8 +281,6 @@ extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu); long kvmppc_read_intr(void); void kvmppc_bad_interrupt(struct pt_regs *regs); -void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip); -void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip); void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr); void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); -- cgit v1.2.3 From b8c7ee79b1a37442a910b8a313045fb9aa639911 Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Mon, 11 Jul 2022 19:36:17 -0300 Subject: KVM: PPC: Book3s HV: Remove unused function kvmppc_bad_interrupt The commit fae5c9f3664b ("KVM: PPC: Book3S HV: remove ISA v3.0 and v3.1 support from P7/8 path") removed the last reference to the function. Fixes: fae5c9f3664b ("KVM: PPC: Book3S HV: remove ISA v3.0 and v3.1 support from P7/8 path") Signed-off-by: Murilo Opsfelder Araujo Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220711223617.63625-3-muriloo@linux.ibm.com --- arch/powerpc/include/asm/kvm_book3s.h | 1 - arch/powerpc/kvm/book3s_hv_builtin.c | 18 ------------------ 2 files changed, 19 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index ff1336ab4c47..bbf5e2c5fe09 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -280,7 +280,6 @@ extern void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu); extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu); long kvmppc_read_intr(void); -void kvmppc_bad_interrupt(struct pt_regs *regs); void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr); void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 88a8f6473c4e..c15c6faedce5 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -489,24 +489,6 @@ static long kvmppc_read_one_intr(bool *again) return kvmppc_check_passthru(xisr, xirr, again); } -void kvmppc_bad_interrupt(struct pt_regs *regs) -{ - /* - * 100 could happen at any time, 200 can happen due to invalid real - * address access for example (or any time due to a hardware problem). - */ - if (TRAP(regs) == 0x100) { - get_paca()->in_nmi++; - system_reset_exception(regs); - get_paca()->in_nmi--; - } else if (TRAP(regs) == 0x200) { - machine_check_exception(regs); - } else { - die("Bad interrupt in KVM entry/exit code", regs, SIGABRT); - } - panic("Bad KVM trap"); -} - static void kvmppc_end_cede(struct kvm_vcpu *vcpu) { vcpu->arch.ceded = 0; -- cgit v1.2.3 From 1e9fdf21a4339b102539f476a9842e7526c01939 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jul 2022 09:18:03 +0200 Subject: mmu_gather: Remove per arch tlb_{start,end}_vma() Scattered across the archs are 3 basic forms of tlb_{start,end}_vma(). Provide two new MMU_GATHER_knobs to enumerate them and remove the per arch tlb_{start,end}_vma() implementations. - MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range() but does *NOT* want to call it for each VMA. - MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the invalidate across multiple VMAs if possible. With these it is possible to capture the three forms: 1) empty stubs; select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS 2) start: flush_cache_range(), end: empty; select MMU_GATHER_MERGE_VMAS 3) start: flush_cache_range(), end: flush_tlb_range(); default Obviously, if the architecture does not have flush_cache_range() then it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: David Miller Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/csky/include/asm/tlb.h | 13 ------------- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/tlb.h | 10 ---------- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/tlb.h | 2 -- arch/s390/Kconfig | 1 + arch/s390/include/asm/tlb.h | 3 --- arch/sparc/Kconfig | 2 ++ arch/sparc/include/asm/tlb_64.h | 2 -- arch/x86/Kconfig | 1 + arch/x86/include/asm/tlb.h | 3 --- include/asm-generic/tlb.h | 21 +++++++++++++++++++-- 13 files changed, 32 insertions(+), 35 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/Kconfig b/arch/Kconfig index fcf9a41a4ef5..71b9272acb28 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE config MMU_GATHER_NO_RANGE bool + select MMU_GATHER_MERGE_VMAS + +config MMU_GATHER_NO_FLUSH_CACHE + bool + +config MMU_GATHER_MERGE_VMAS + bool config MMU_GATHER_NO_GATHER bool diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h index 3498e65f59f8..750d041938d8 100644 --- a/arch/csky/include/asm/tlb.h +++ b/arch/csky/include/asm/tlb.h @@ -4,19 +4,6 @@ #define __ASM_CSKY_TLB_H #include - -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) #include diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 53a912befb62..b57daee98b89 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -108,6 +108,7 @@ config LOONGARCH select TRACE_IRQFLAGS_SUPPORT select USE_PERCPU_NUMA_NODE_ID select ZONE_DMA32 + select MMU_GATHER_MERGE_VMAS if MMU config 32BIT bool diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index 4f629ae9d5a9..dd24f5898f65 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -137,16 +137,6 @@ static inline void invtlb_all(u32 op, u32 info, u64 addr) ); } -/* - * LoongArch doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for area to be unmapped. - */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) static void tlb_flush(struct mmu_gather *tlb); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7aa12e88c580..c235648fae23 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -256,6 +256,7 @@ config PPC select IRQ_FORCED_THREADING select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64 diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 09a9ae5f3656..b3de6102a907 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -19,8 +19,6 @@ #include -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry #define tlb_flush tlb_flush diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8cd9e56c629b..5a1a8dfda6f8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -204,6 +204,7 @@ config S390 select IOMMU_SUPPORT if PCI select MMU_GATHER_NO_GATHER select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PCI select NEED_SG_DMA_LENGTH if PCI diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index fe6407f0eb1b..3a5c8fb590e5 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb #define pmd_free_tlb pmd_free_tlb diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index ba449c47effd..4f7d1dfbc608 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -67,6 +67,8 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP + select MMU_GATHER_MERGE_VMAS + select MMU_GATHER_NO_FLUSH_CACHE select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index 779a5a0f0608..3037187482db 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct *mm); void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *); void flush_tlb_pending(void); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define tlb_flush(tlb) flush_tlb_pending() /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e58798f636d4..7fff10e15969 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -245,6 +245,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 1bfe979bb9bc..580636cdc257 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -2,9 +2,6 @@ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index ff3e82553a76..c1f03c1acbfc 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -158,9 +158,24 @@ * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * + * MMU_GATHER_NO_FLUSH_CACHE + * + * Indicates the architecture has flush_cache_range() but it needs *NOT* be called + * before unmapping a VMA. + * + * NOTE: strictly speaking we shouldn't have this knob and instead rely on + * flush_cache_range() being a NOP, except Sparc64 seems to be + * different here. + * + * MMU_GATHER_MERGE_VMAS + * + * Indicates the architecture wants to merge ranges over VMAs; typical when + * multiple range invalidates are more expensive than a full invalidate. + * * MMU_GATHER_NO_RANGE * - * Use this if your architecture lacks an efficient flush_tlb_range(). + * Use this if your architecture lacks an efficient flush_tlb_range(). This + * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * @@ -493,14 +508,16 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct * return; tlb_update_vma_flags(tlb, vma); +#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); +#endif } #endif #ifndef tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { - if (tlb->fullmm) + if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* -- cgit v1.2.3 From ae85b23c65dbb079c83d8a08ff7699215f104e42 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Sat, 23 Jul 2022 06:49:41 +0900 Subject: PCI: Remove pci_get_legacy_ide_irq() and asm-generic/pci.h pci_get_legacy_ide_irq() is only used on platforms that support PNP, so many architectures define it but never use it. Replace uses of it with ATA_PRIMARY_IRQ() and ATA_SECONDARY_IRQ(), which provide the same functionality. Since pci_get_legacy_ide_irq() is no longer used, remove all the architecture-specific definitions of it as well as asm-generic/pci.h, which only provides pci_get_legacy_ide_irq() [bhelgaas: commit log] Co-developed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20220722214944.831438-2-shorne@gmail.com Signed-off-by: Arnd Bergmann Signed-off-by: Stafford Horne Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Acked-by: Geert Uytterhoeven Acked-by: Pierre Morel Acked-by: Rafael J. Wysocki --- arch/alpha/include/asm/pci.h | 6 ------ arch/arm/include/asm/pci.h | 5 ----- arch/arm64/include/asm/pci.h | 6 ------ arch/csky/include/asm/pci.h | 6 ------ arch/ia64/include/asm/pci.h | 6 ------ arch/m68k/include/asm/pci.h | 2 -- arch/mips/include/asm/pci.h | 6 ------ arch/parisc/include/asm/pci.h | 5 ----- arch/powerpc/include/asm/pci.h | 1 - arch/riscv/include/asm/pci.h | 6 ------ arch/s390/include/asm/pci.h | 1 - arch/sh/include/asm/pci.h | 6 ------ arch/sparc/include/asm/pci.h | 9 --------- arch/um/include/asm/pci.h | 8 -------- arch/x86/include/asm/pci.h | 3 --- arch/xtensa/include/asm/pci.h | 3 --- drivers/pnp/resource.c | 5 +++-- include/asm-generic/pci.h | 17 ----------------- 18 files changed, 3 insertions(+), 98 deletions(-) delete mode 100644 include/asm-generic/pci.h (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h index cf6bc1e64d66..6312656279d7 100644 --- a/arch/alpha/include/asm/pci.h +++ b/arch/alpha/include/asm/pci.h @@ -56,12 +56,6 @@ struct pci_controller { /* IOMMU controls. */ -/* TODO: integrate with include/asm-generic/pci.h ? */ -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return channel ? 15 : 14; -} - #define pci_domain_nr(bus) ((struct pci_controller *)(bus)->sysdata)->index static inline int pci_proc_domain(struct pci_bus *bus) diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h index 68e6f25784a4..5916b88d4c94 100644 --- a/arch/arm/include/asm/pci.h +++ b/arch/arm/include/asm/pci.h @@ -22,11 +22,6 @@ static inline int pci_proc_domain(struct pci_bus *bus) #define HAVE_PCI_MMAP #define ARCH_GENERIC_PCI_MMAP_RESOURCE -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return channel ? 15 : 14; -} - extern void pcibios_report_status(unsigned int status_mask, int warn); #endif /* __KERNEL__ */ diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h index b33ca260e3c9..0aebc3488c32 100644 --- a/arch/arm64/include/asm/pci.h +++ b/arch/arm64/include/asm/pci.h @@ -23,12 +23,6 @@ extern int isa_dma_bridge_buggy; #ifdef CONFIG_PCI -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - /* no legacy IRQ on arm64 */ - return -ENODEV; -} - static inline int pci_proc_domain(struct pci_bus *bus) { return 1; diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h index ebc765b1f78b..0535f1aaae38 100644 --- a/arch/csky/include/asm/pci.h +++ b/arch/csky/include/asm/pci.h @@ -18,12 +18,6 @@ extern int isa_dma_bridge_buggy; #ifdef CONFIG_PCI -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - /* no legacy IRQ on csky */ - return -ENODEV; -} - static inline int pci_proc_domain(struct pci_bus *bus) { /* always show the domain in /proc */ diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index 8c163d1d0189..fa8f545c24c9 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -63,10 +63,4 @@ static inline int pci_proc_domain(struct pci_bus *bus) return (pci_domain_nr(bus) != 0); } -#define HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return channel ? isa_irq_to_vector(15) : isa_irq_to_vector(14); -} - #endif /* _ASM_IA64_PCI_H */ diff --git a/arch/m68k/include/asm/pci.h b/arch/m68k/include/asm/pci.h index 5a4bc223743b..ccdfa0dc8413 100644 --- a/arch/m68k/include/asm/pci.h +++ b/arch/m68k/include/asm/pci.h @@ -2,8 +2,6 @@ #ifndef _ASM_M68K_PCI_H #define _ASM_M68K_PCI_H -#include - #define pcibios_assign_all_busses() 1 #define PCIBIOS_MIN_IO 0x00000100 diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 9ffc8192adae..3fd6e22c108b 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -139,10 +139,4 @@ static inline int pci_proc_domain(struct pci_bus *bus) /* Do platform specific device initialization at pci_enable_device() time */ extern int pcibios_plat_dev_init(struct pci_dev *dev); -/* Chances are this interrupt is wired PC-style ... */ -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return channel ? 15 : 14; -} - #endif /* _ASM_PCI_H */ diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h index f14465b84de4..127ed5021ae3 100644 --- a/arch/parisc/include/asm/pci.h +++ b/arch/parisc/include/asm/pci.h @@ -162,11 +162,6 @@ extern void pcibios_init_bridge(struct pci_dev *); #define PCIBIOS_MIN_IO 0x10 #define PCIBIOS_MIN_MEM 0x1000 /* NBPG - but pci/setup-res.c dies */ -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return channel ? 15 : 14; -} - #define HAVE_PCI_MMAP #define ARCH_GENERIC_PCI_MMAP_RESOURCE diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 915d6ee4b40a..f9da506751bb 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -39,7 +39,6 @@ #define pcibios_assign_all_busses() \ (pci_has_flag(PCI_REASSIGN_ALL_BUS)) -#define HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) { if (ppc_md.pci_get_legacy_ide_irq) diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h index 7fd52a30e605..a7b8f0d0df7f 100644 --- a/arch/riscv/include/asm/pci.h +++ b/arch/riscv/include/asm/pci.h @@ -23,12 +23,6 @@ extern int isa_dma_bridge_buggy; #ifdef CONFIG_PCI -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - /* no legacy IRQ on risc-v */ - return -ENODEV; -} - static inline int pci_proc_domain(struct pci_bus *bus) { /* always show the domain in /proc */ diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index fdb9745ee998..5889ddcbc374 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index ad22e88c6657..54c30126ea17 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -88,10 +88,4 @@ static inline int pci_proc_domain(struct pci_bus *bus) return hose->need_domain_info; } -/* Chances are this interrupt is wired PC-style ... */ -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return channel ? 15 : 14; -} - #endif /* __ASM_SH_PCI_H */ diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h index 4deddf430e5d..0c58f65bd172 100644 --- a/arch/sparc/include/asm/pci.h +++ b/arch/sparc/include/asm/pci.h @@ -40,13 +40,4 @@ static inline int pci_proc_domain(struct pci_bus *bus) #define get_pci_unmapped_area get_fb_unmapped_area #endif /* CONFIG_SPARC64 */ -#if defined(CONFIG_SPARC64) || defined(CONFIG_LEON_PCI) -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return PCI_IRQ_NONE; -} -#else -#include -#endif - #endif /* ___ASM_SPARC_PCI_H */ diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h index da13fd5519ef..26b96c02ef61 100644 --- a/arch/um/include/asm/pci.h +++ b/arch/um/include/asm/pci.h @@ -11,14 +11,6 @@ extern int isa_dma_bridge_buggy; -#ifdef CONFIG_PCI -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - /* no legacy IRQs */ - return -ENODEV; -} -#endif - #ifdef CONFIG_PCI_DOMAINS static inline int pci_proc_domain(struct pci_bus *bus) { diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index f3fd5928bcbb..736793d65bcb 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -105,9 +105,6 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); -/* generic pci stuff */ -#include - #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ static inline int __pcibus_to_node(const struct pci_bus *bus) diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h index 8e2b48a268db..b56de9635b6c 100644 --- a/arch/xtensa/include/asm/pci.h +++ b/arch/xtensa/include/asm/pci.h @@ -43,7 +43,4 @@ #define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 #define arch_can_pci_mmap_io() 1 -/* Generic PCI */ -#include - #endif /* _XTENSA_PCI_H */ diff --git a/drivers/pnp/resource.c b/drivers/pnp/resource.c index 2fa0f7d55259..8f7695624c8c 100644 --- a/drivers/pnp/resource.c +++ b/drivers/pnp/resource.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -322,8 +323,8 @@ static int pci_dev_uses_irq(struct pnp_dev *pnp, struct pci_dev *pci, * treat the compatibility IRQs as busy. */ if ((progif & 0x5) != 0x5) - if (pci_get_legacy_ide_irq(pci, 0) == irq || - pci_get_legacy_ide_irq(pci, 1) == irq) { + if (ATA_PRIMARY_IRQ(pci) == irq || + ATA_SECONDARY_IRQ(pci) == irq) { pnp_dbg(&pnp->dev, " legacy IDE device %s " "using irq %d\n", pci_name(pci), irq); return 1; diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h deleted file mode 100644 index 6bb3cd3d695a..000000000000 --- a/include/asm-generic/pci.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/asm-generic/pci.h - * - * Copyright (C) 2003 Russell King - */ -#ifndef _ASM_GENERIC_PCI_H -#define _ASM_GENERIC_PCI_H - -#ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ -static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -{ - return channel ? 15 : 14; -} -#endif /* HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ */ - -#endif /* _ASM_GENERIC_PCI_H */ -- cgit v1.2.3 From abb4970ac33514c84b143516583eaf8cc47abd67 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Sat, 23 Jul 2022 06:49:42 +0900 Subject: PCI: Move isa_dma_bridge_buggy out of asm/dma.h The isa_dma_bridge_buggy symbol is only used for x86_32, and only x86_32 platforms or quirks ever set it. Add a new linux/isa-dma.h header that #defines isa_dma_bridge_buggy to 0 except on x86_32, where we keep it as a variable, and remove all the arch- specific definitions. [bhelgaas: commit log] Suggested-by: Arnd Bergmann Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220722214944.831438-3-shorne@gmail.com Signed-off-by: Stafford Horne Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Acked-by: Geert Uytterhoeven --- arch/alpha/include/asm/dma.h | 9 --------- arch/arc/include/asm/dma.h | 5 ----- arch/arm/include/asm/dma.h | 6 ------ arch/arm64/include/asm/pci.h | 2 -- arch/csky/include/asm/pci.h | 2 -- arch/ia64/include/asm/dma.h | 2 -- arch/m68k/include/asm/dma.h | 6 ------ arch/microblaze/include/asm/dma.h | 6 ------ arch/mips/include/asm/dma.h | 8 -------- arch/parisc/include/asm/dma.h | 6 ------ arch/powerpc/include/asm/dma.h | 6 ------ arch/riscv/include/asm/pci.h | 2 -- arch/s390/include/asm/dma.h | 6 ------ arch/sh/include/asm/dma.h | 6 ------ arch/sparc/include/asm/dma.h | 8 -------- arch/um/include/asm/pci.h | 2 -- arch/x86/include/asm/dma.h | 8 -------- arch/xtensa/include/asm/dma.h | 7 ------- drivers/comedi/drivers/comedi_isadma.c | 2 +- drivers/pci/pci.c | 2 ++ drivers/pci/quirks.c | 4 +++- include/linux/isa-dma.h | 14 ++++++++++++++ sound/core/isadma.c | 2 +- 23 files changed, 21 insertions(+), 100 deletions(-) create mode 100644 include/linux/isa-dma.h (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/dma.h b/arch/alpha/include/asm/dma.h index 28610ea7786d..a04d76b96089 100644 --- a/arch/alpha/include/asm/dma.h +++ b/arch/alpha/include/asm/dma.h @@ -365,13 +365,4 @@ extern void free_dma(unsigned int dmanr); /* release it again */ #define KERNEL_HAVE_CHECK_DMA extern int check_dma(unsigned int dmanr); -/* From PCI */ - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - - #endif /* _ASM_DMA_H */ diff --git a/arch/arc/include/asm/dma.h b/arch/arc/include/asm/dma.h index 5b744f4b10a7..02431027ed2f 100644 --- a/arch/arc/include/asm/dma.h +++ b/arch/arc/include/asm/dma.h @@ -7,10 +7,5 @@ #define ASM_ARC_DMA_H #define MAX_DMA_ADDRESS 0xC0000000 -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy 0 -#endif #endif diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index a81dda65c576..907d139be431 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -143,10 +143,4 @@ extern int get_dma_residue(unsigned int chan); #endif /* CONFIG_ISA_DMA_API */ -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* __ASM_ARM_DMA_H */ diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h index 0aebc3488c32..682c922b5658 100644 --- a/arch/arm64/include/asm/pci.h +++ b/arch/arm64/include/asm/pci.h @@ -20,8 +20,6 @@ #define arch_can_pci_mmap_wc() 1 #define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 -extern int isa_dma_bridge_buggy; - #ifdef CONFIG_PCI static inline int pci_proc_domain(struct pci_bus *bus) { diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h index 0535f1aaae38..5c02454ec724 100644 --- a/arch/csky/include/asm/pci.h +++ b/arch/csky/include/asm/pci.h @@ -15,8 +15,6 @@ /* C-SKY shim does not initialize PCI bus */ #define pcibios_assign_all_busses() 1 -extern int isa_dma_bridge_buggy; - #ifdef CONFIG_PCI static inline int pci_proc_domain(struct pci_bus *bus) { diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h index 59625e9c1f9c..eaed2626ffda 100644 --- a/arch/ia64/include/asm/dma.h +++ b/arch/ia64/include/asm/dma.h @@ -12,8 +12,6 @@ extern unsigned long MAX_DMA_ADDRESS; -extern int isa_dma_bridge_buggy; - #define free_dma(x) #endif /* _ASM_IA64_DMA_H */ diff --git a/arch/m68k/include/asm/dma.h b/arch/m68k/include/asm/dma.h index f6c5e0dfb4e5..1c8d9c5bc2fa 100644 --- a/arch/m68k/include/asm/dma.h +++ b/arch/m68k/include/asm/dma.h @@ -6,10 +6,4 @@ bootmem allocator (but this should do it for this) */ #define MAX_DMA_ADDRESS PAGE_OFFSET -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _M68K_DMA_H */ diff --git a/arch/microblaze/include/asm/dma.h b/arch/microblaze/include/asm/dma.h index f801582be912..7484c9eb66c4 100644 --- a/arch/microblaze/include/asm/dma.h +++ b/arch/microblaze/include/asm/dma.h @@ -9,10 +9,4 @@ /* Virtual address corresponding to last available physical memory address. */ #define MAX_DMA_ADDRESS (CONFIG_KERNEL_START + memory_size - 1) -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_MICROBLAZE_DMA_H */ diff --git a/arch/mips/include/asm/dma.h b/arch/mips/include/asm/dma.h index be726b943530..d6186e6bea7e 100644 --- a/arch/mips/include/asm/dma.h +++ b/arch/mips/include/asm/dma.h @@ -307,12 +307,4 @@ static __inline__ int get_dma_residue(unsigned int dmanr) extern int request_dma(unsigned int dmanr, const char * device_id); /* reserve a DMA channel */ extern void free_dma(unsigned int dmanr); /* release it again */ -/* From PCI */ - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_DMA_H */ diff --git a/arch/parisc/include/asm/dma.h b/arch/parisc/include/asm/dma.h index eea80ed34e6d..9e8c101de902 100644 --- a/arch/parisc/include/asm/dma.h +++ b/arch/parisc/include/asm/dma.h @@ -176,10 +176,4 @@ static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count) #define free_dma(dmanr) -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_DMA_H */ diff --git a/arch/powerpc/include/asm/dma.h b/arch/powerpc/include/asm/dma.h index 6161a9596196..d97c66d9ae34 100644 --- a/arch/powerpc/include/asm/dma.h +++ b/arch/powerpc/include/asm/dma.h @@ -340,11 +340,5 @@ extern int request_dma(unsigned int dmanr, const char *device_id); /* release it again */ extern void free_dma(unsigned int dmanr); -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_DMA_H */ diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h index a7b8f0d0df7f..f904df586c03 100644 --- a/arch/riscv/include/asm/pci.h +++ b/arch/riscv/include/asm/pci.h @@ -20,8 +20,6 @@ #define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 -extern int isa_dma_bridge_buggy; - #ifdef CONFIG_PCI static inline int pci_proc_domain(struct pci_bus *bus) { diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h index 6f26f35d4a71..dec1c4ce628c 100644 --- a/arch/s390/include/asm/dma.h +++ b/arch/s390/include/asm/dma.h @@ -11,10 +11,4 @@ */ #define MAX_DMA_ADDRESS 0x80000000 -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_S390_DMA_H */ diff --git a/arch/sh/include/asm/dma.h b/arch/sh/include/asm/dma.h index 17d23ae98c77..c8bee3f985a2 100644 --- a/arch/sh/include/asm/dma.h +++ b/arch/sh/include/asm/dma.h @@ -137,10 +137,4 @@ extern int register_chan_caps(const char *dmac, struct dma_chan_caps *capslist); extern int dma_create_sysfs_files(struct dma_channel *, struct dma_info *); extern void dma_remove_sysfs_files(struct dma_channel *, struct dma_info *); -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* __ASM_SH_DMA_H */ diff --git a/arch/sparc/include/asm/dma.h b/arch/sparc/include/asm/dma.h index 462e7c794a09..08043f35b110 100644 --- a/arch/sparc/include/asm/dma.h +++ b/arch/sparc/include/asm/dma.h @@ -82,14 +82,6 @@ #define DMA_BURST64 0x40 #define DMA_BURSTBITS 0x7f -/* From PCI */ - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #ifdef CONFIG_SPARC32 struct device; diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h index 26b96c02ef61..1211855aff34 100644 --- a/arch/um/include/asm/pci.h +++ b/arch/um/include/asm/pci.h @@ -9,8 +9,6 @@ #define pcibios_assign_all_busses() 1 -extern int isa_dma_bridge_buggy; - #ifdef CONFIG_PCI_DOMAINS static inline int pci_proc_domain(struct pci_bus *bus) { diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 8e95aa4b0d17..8ae6e0e11b8b 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -307,12 +307,4 @@ extern int request_dma(unsigned int dmanr, const char *device_id); extern void free_dma(unsigned int dmanr); #endif -/* From PCI */ - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_X86_DMA_H */ diff --git a/arch/xtensa/include/asm/dma.h b/arch/xtensa/include/asm/dma.h index bb099a373b5a..172644539032 100644 --- a/arch/xtensa/include/asm/dma.h +++ b/arch/xtensa/include/asm/dma.h @@ -52,11 +52,4 @@ extern int request_dma(unsigned int dmanr, const char * device_id); extern void free_dma(unsigned int dmanr); -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - - #endif diff --git a/drivers/comedi/drivers/comedi_isadma.c b/drivers/comedi/drivers/comedi_isadma.c index 700982464c53..020b3d1e1ac0 100644 --- a/drivers/comedi/drivers/comedi_isadma.c +++ b/drivers/comedi/drivers/comedi_isadma.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index cfaf40a540a8..60c55d2cb2cc 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -41,8 +41,10 @@ const char *pci_power_names[] = { }; EXPORT_SYMBOL_GPL(pci_power_names); +#ifdef CONFIG_X86_32 int isa_dma_bridge_buggy; EXPORT_SYMBOL(isa_dma_bridge_buggy); +#endif int pci_pci_problems; EXPORT_SYMBOL(pci_pci_problems); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 41aeaa235132..6fc64509eee7 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -17,6 +17,7 @@ #include #include #include +#include /* isa_dma_bridge_buggy */ #include #include #include @@ -30,7 +31,6 @@ #include #include #include -#include /* isa_dma_bridge_buggy */ #include "pci.h" static ktime_t fixup_debug_start(struct pci_dev *dev, @@ -239,6 +239,7 @@ static void quirk_passive_release(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82441, quirk_passive_release); DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82441, quirk_passive_release); +#ifdef CONFIG_X86_32 /* * The VIA VP2/VP3/MVP3 seem to have some 'features'. There may be a * workaround but VIA don't answer queries. If you happen to have good @@ -265,6 +266,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, quirk_isa_dma DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_CBUS_1, quirk_isa_dma_hangs); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_CBUS_2, quirk_isa_dma_hangs); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_CBUS_3, quirk_isa_dma_hangs); +#endif /* * Intel NM10 "TigerPoint" LPC PM1a_STS.BM_STS must be clear diff --git a/include/linux/isa-dma.h b/include/linux/isa-dma.h new file mode 100644 index 000000000000..61504a8c1b9e --- /dev/null +++ b/include/linux/isa-dma.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_ISA_DMA_H +#define __LINUX_ISA_DMA_H + +#include + +#if defined(CONFIG_PCI) && defined(CONFIG_X86_32) +extern int isa_dma_bridge_buggy; +#else +#define isa_dma_bridge_buggy (0) +#endif + +#endif /* __LINUX_ISA_DMA_H */ diff --git a/sound/core/isadma.c b/sound/core/isadma.c index 1f45ede023b4..18a86212e3a8 100644 --- a/sound/core/isadma.c +++ b/sound/core/isadma.c @@ -12,8 +12,8 @@ #undef HAVE_REALLY_SLOW_DMA_CONTROLLER #include +#include #include -#include /** * snd_dma_program - program an ISA DMA transfer -- cgit v1.2.3 From d349ab99eec7ab0f977fc4aac27aa476907acf90 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 17 Jul 2022 12:35:24 +0200 Subject: random: handle archrandom with multiple longs The archrandom interface was originally designed for x86, which supplies RDRAND/RDSEED for receiving random words into registers, resulting in one function to generate an int and another to generate a long. However, other architectures don't follow this. On arm64, the SMCCC TRNG interface can return between one and three longs. On s390, the CPACF TRNG interface can return arbitrary amounts, with four longs having the same cost as one. On UML, the os_getrandom() interface can return arbitrary amounts. So change the api signature to take a "max_longs" parameter designating the maximum number of longs requested, and then return the number of longs generated. Since callers need to check this return value and loop anyway, each arch implementation does not bother implementing its own loop to try again to fill the maximum number of longs. Additionally, all existing callers pass in a constant max_longs parameter. Taken together, these two things mean that the codegen doesn't really change much for one-word-at-a-time platforms, while performance is greatly improved on platforms such as s390. Acked-by: Heiko Carstens Acked-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Michael Ellerman Acked-by: Borislav Petkov Signed-off-by: Jason A. Donenfeld --- arch/arm64/include/asm/archrandom.h | 102 ++++++++++++++++------------------ arch/arm64/kernel/kaslr.c | 2 +- arch/powerpc/include/asm/archrandom.h | 30 ++-------- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/s390/include/asm/archrandom.h | 29 +++------- arch/um/include/asm/archrandom.h | 21 +++---- arch/x86/include/asm/archrandom.h | 41 ++------------ arch/x86/kernel/espfix_64.c | 2 +- drivers/char/random.c | 45 ++++++++++----- include/asm-generic/archrandom.h | 18 ++---- include/linux/random.h | 12 ++-- 11 files changed, 116 insertions(+), 188 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index c3b9fa56af67..109e2a4454be 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -58,7 +58,7 @@ static inline bool __arm64_rndrrs(unsigned long *v) return ok; } -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { /* * Only support the generic interface after we have detected @@ -66,27 +66,15 @@ static inline bool __must_check arch_get_random_long(unsigned long *v) * cpufeature code and with potential scheduling between CPUs * with and without the feature. */ - if (cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndr(v)) - return true; - return false; + if (max_longs && cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndr(v)) + return 1; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - if (cpus_have_const_cap(ARM64_HAS_RNG)) { - unsigned long val; - - if (__arm64_rndr(&val)) { - *v = val; - return true; - } - } - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - struct arm_smccc_res res; + if (!max_longs) + return 0; /* * We prefer the SMCCC call, since its semantics (return actual @@ -95,10 +83,23 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v) * (the output of a pseudo RNG freshly seeded by a TRNG). */ if (smccc_trng_available) { - arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res); + struct arm_smccc_res res; + + max_longs = min_t(size_t, 3, max_longs); + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, max_longs * 64, &res); if ((int)res.a0 >= 0) { - *v = res.a3; - return true; + switch (max_longs) { + case 3: + *v++ = res.a1; + fallthrough; + case 2: + *v++ = res.a2; + fallthrough; + case 1: + *v++ = res.a3; + break; + } + return max_longs; } } @@ -108,32 +109,9 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v) * enough to implement this API if no other entropy source exists. */ if (cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndrrs(v)) - return true; + return 1; - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - struct arm_smccc_res res; - unsigned long val; - - if (smccc_trng_available) { - arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 32, &res); - if ((int)res.a0 >= 0) { - *v = res.a3 & GENMASK(31, 0); - return true; - } - } - - if (cpus_have_const_cap(ARM64_HAS_RNG)) { - if (__arm64_rndrrs(&val)) { - *v = val; - return true; - } - } - - return false; + return 0; } static inline bool __init __early_cpu_has_rndr(void) @@ -143,26 +121,40 @@ static inline bool __init __early_cpu_has_rndr(void) return (ftr >> ID_AA64ISAR0_EL1_RNDR_SHIFT) & 0xf; } -static inline bool __init __must_check -arch_get_random_seed_long_early(unsigned long *v) +static inline size_t __init __must_check +arch_get_random_seed_longs_early(unsigned long *v, size_t max_longs) { WARN_ON(system_state != SYSTEM_BOOTING); + if (!max_longs) + return 0; + if (smccc_trng_available) { struct arm_smccc_res res; - arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res); + max_longs = min_t(size_t, 3, max_longs); + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, max_longs * 64, &res); if ((int)res.a0 >= 0) { - *v = res.a3; - return true; + switch (max_longs) { + case 3: + *v++ = res.a1; + fallthrough; + case 2: + *v++ = res.a2; + fallthrough; + case 1: + *v++ = res.a3; + break; + } + return max_longs; } } if (__early_cpu_has_rndr() && __arm64_rndr(v)) - return true; + return 1; - return false; + return 0; } -#define arch_get_random_seed_long_early arch_get_random_seed_long_early +#define arch_get_random_seed_longs_early arch_get_random_seed_longs_early #endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 418b2bba1521..c5d541f358d3 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -106,7 +106,7 @@ u64 __init kaslr_early_init(void) * and supported. */ - if (arch_get_random_seed_long_early(&raw)) + if (arch_get_random_seed_longs_early(&raw, 1)) seed ^= raw; if (!seed) { diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 25ba65df6b1a..0e365c5b2396 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -4,34 +4,16 @@ #include -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - if (ppc_md.get_random_seed) - return ppc_md.get_random_seed(v); - - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - unsigned long val; - bool rc; - - rc = arch_get_random_seed_long(&val); - if (rc) - *v = val; - - return rc; + if (max_longs && ppc_md.get_random_seed && ppc_md.get_random_seed(v)) + return 1; + return 0; } #ifdef CONFIG_PPC_POWERNV diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e08fb3124dca..631062cde6b4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1207,7 +1207,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; #endif case H_RANDOM: - if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) + if (!arch_get_random_seed_longs(&vcpu->arch.regs.gpr[4], 1)) ret = H_HARDWARE; break; case H_RPT_INVALIDATE: diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index 0a1c2e66c709..cf5e000df0a1 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -18,34 +18,19 @@ DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); extern atomic64_t s390_arch_random_counter; -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) -{ - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - if (static_branch_likely(&s390_arch_random_available)) { - cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v)); - atomic64_add(sizeof(*v), &s390_arch_random_counter); - return true; - } - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { if (static_branch_likely(&s390_arch_random_available)) { - cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v)); - atomic64_add(sizeof(*v), &s390_arch_random_counter); - return true; + cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v)); + atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter); + return max_longs; } - return false; + return 0; } #endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/um/include/asm/archrandom.h b/arch/um/include/asm/archrandom.h index 2f24cb96391d..24e16c979c51 100644 --- a/arch/um/include/asm/archrandom.h +++ b/arch/um/include/asm/archrandom.h @@ -7,24 +7,19 @@ /* This is from , but better not to #include that in a global header here. */ ssize_t os_getrandom(void *buf, size_t len, unsigned int flags); -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return os_getrandom(v, sizeof(*v), 0) == sizeof(*v); -} + ssize_t ret; -static inline bool __must_check arch_get_random_int(unsigned int *v) -{ - return os_getrandom(v, sizeof(*v), 0) == sizeof(*v); -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - return false; + ret = os_getrandom(v, max_longs * sizeof(*v), 0); + if (ret < 0) + return 0; + return ret / sizeof(*v); } -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } #endif diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index fb235b696175..02bae8e0758b 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -31,20 +31,6 @@ static inline bool __must_check rdrand_long(unsigned long *v) return false; } -static inline bool __must_check rdrand_int(unsigned int *v) -{ - bool ok; - unsigned int retry = RDRAND_RETRY_LOOPS; - do { - asm volatile("rdrand %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - if (ok) - return true; - } while (--retry); - return false; -} - static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; @@ -54,38 +40,19 @@ static inline bool __must_check rdseed_long(unsigned long *v) return ok; } -static inline bool __must_check rdseed_int(unsigned int *v) -{ - bool ok; - asm volatile("rdseed %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - return ok; -} - /* * These are the generic interfaces; they must not be declared if the * stubs in are to be invoked. */ -static inline bool __must_check arch_get_random_long(unsigned long *v) -{ - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false; -} - -static inline bool __must_check arch_get_random_int(unsigned int *v) -{ - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_int(v) : false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0; } -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0; } #ifndef CONFIG_UML diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4fe7af58cfe1..9417d5aa7305 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -100,7 +100,7 @@ static void init_espfix_random(void) * This is run before the entropy pools are initialized, * but this is hopefully better than nothing. */ - if (!arch_get_random_long(&rand)) { + if (!arch_get_random_longs(&rand, 1)) { /* The constant is an arbitrary large prime */ rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0c6568ae5f68..7bf11fa66265 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -596,12 +596,20 @@ static void extract_entropy(void *buf, size_t len) unsigned long rdseed[32 / sizeof(long)]; size_t counter; } block; - size_t i; + size_t i, longs; - for (i = 0; i < ARRAY_SIZE(block.rdseed); ++i) { - if (!arch_get_random_seed_long(&block.rdseed[i]) && - !arch_get_random_long(&block.rdseed[i])) - block.rdseed[i] = random_get_entropy(); + for (i = 0; i < ARRAY_SIZE(block.rdseed);) { + longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); + if (longs) { + i += longs; + continue; + } + longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); + if (longs) { + i += longs; + continue; + } + block.rdseed[i++] = random_get_entropy(); } spin_lock_irqsave(&input_pool.lock, flags); @@ -776,22 +784,31 @@ static struct notifier_block pm_notifier = { .notifier_call = random_pm_notifica int __init random_init(const char *command_line) { ktime_t now = ktime_get_real(); - unsigned int i, arch_bits; - unsigned long entropy; + size_t i, longs, arch_bits; + unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)]; #if defined(LATENT_ENTROPY_PLUGIN) static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy; _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed)); #endif - for (i = 0, arch_bits = BLAKE2S_BLOCK_SIZE * 8; - i < BLAKE2S_BLOCK_SIZE; i += sizeof(entropy)) { - if (!arch_get_random_seed_long_early(&entropy) && - !arch_get_random_long_early(&entropy)) { - entropy = random_get_entropy(); - arch_bits -= sizeof(entropy) * 8; + for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) { + longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i); + if (longs) { + _mix_pool_bytes(entropy, sizeof(*entropy) * longs); + i += longs; + continue; } - _mix_pool_bytes(&entropy, sizeof(entropy)); + longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i); + if (longs) { + _mix_pool_bytes(entropy, sizeof(*entropy) * longs); + i += longs; + continue; + } + entropy[0] = random_get_entropy(); + _mix_pool_bytes(entropy, sizeof(*entropy)); + arch_bits -= sizeof(*entropy) * 8; + ++i; } _mix_pool_bytes(&now, sizeof(now)); _mix_pool_bytes(utsname(), sizeof(*(utsname()))); diff --git a/include/asm-generic/archrandom.h b/include/asm-generic/archrandom.h index 3a5ee202dd86..3cd7f980cfdc 100644 --- a/include/asm-generic/archrandom.h +++ b/include/asm-generic/archrandom.h @@ -2,24 +2,14 @@ #ifndef __ASM_GENERIC_ARCHRANDOM_H__ #define __ASM_GENERIC_ARCHRANDOM_H__ -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - return false; + return 0; } #endif diff --git a/include/linux/random.h b/include/linux/random.h index 865770e29f3e..3fec206487f6 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -112,19 +112,19 @@ declare_get_random_var_wait(long, unsigned long) * Called from the boot CPU during startup; not valid to call once * secondary CPUs are up and preemption is possible. */ -#ifndef arch_get_random_seed_long_early -static inline bool __init arch_get_random_seed_long_early(unsigned long *v) +#ifndef arch_get_random_seed_longs_early +static inline size_t __init arch_get_random_seed_longs_early(unsigned long *v, size_t max_longs) { WARN_ON(system_state != SYSTEM_BOOTING); - return arch_get_random_seed_long(v); + return arch_get_random_seed_longs(v, max_longs); } #endif -#ifndef arch_get_random_long_early -static inline bool __init arch_get_random_long_early(unsigned long *v) +#ifndef arch_get_random_longs_early +static inline bool __init arch_get_random_longs_early(unsigned long *v, size_t max_longs) { WARN_ON(system_state != SYSTEM_BOOTING); - return arch_get_random_long(v); + return arch_get_random_longs(v, max_longs); } #endif -- cgit v1.2.3 From f5e74e836097d1004077390717d4bd95d4a2c27a Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 13 Jul 2022 17:47:28 +0200 Subject: powerpc/watchdog: introduce a NMI watchdog's factor Introduce a factor which would apply to the NMI watchdog timeout. This factor is a percentage added to the watchdog_tresh value. The value is set under the watchdog_mutex protection and lockup_detector_reconfigure() is called to recompute wd_panic_timeout_tb. Once the factor is set, it remains until it is set back to 0, which means no impact. Signed-off-by: Laurent Dufour Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220713154729.80789-4-ldufour@linux.ibm.com --- arch/powerpc/include/asm/nmi.h | 2 ++ arch/powerpc/kernel/watchdog.c | 21 ++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index ea0e487f87b1..c3c7adef74de 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -5,8 +5,10 @@ #ifdef CONFIG_PPC_WATCHDOG extern void arch_touch_nmi_watchdog(void); long soft_nmi_interrupt(struct pt_regs *regs); +void watchdog_nmi_set_timeout_pct(u64 pct); #else static inline void arch_touch_nmi_watchdog(void) {} +static inline void watchdog_nmi_set_timeout_pct(u64 pct) {} #endif #ifdef CONFIG_NMI_IPI diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index dd882d00845d..dbcc4a793f0b 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -91,6 +91,10 @@ static cpumask_t wd_smp_cpus_pending; static cpumask_t wd_smp_cpus_stuck; static u64 wd_smp_last_reset_tb; +#ifdef CONFIG_PPC_PSERIES +static u64 wd_timeout_pct; +#endif + /* * Try to take the exclusive watchdog action / NMI IPI / printing lock. * wd_smp_lock must be held. If this fails, we should return and wait @@ -527,7 +531,13 @@ static int stop_watchdog_on_cpu(unsigned int cpu) static void watchdog_calc_timeouts(void) { - wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; + u64 threshold = watchdog_thresh; + +#ifdef CONFIG_PPC_PSERIES + threshold += (READ_ONCE(wd_timeout_pct) * threshold) / 100; +#endif + + wd_panic_timeout_tb = threshold * ppc_tb_freq; /* Have the SMP detector trigger a bit later */ wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; @@ -570,3 +580,12 @@ int __init watchdog_nmi_probe(void) } return 0; } + +#ifdef CONFIG_PPC_PSERIES +void watchdog_nmi_set_timeout_pct(u64 pct) +{ + pr_info("Set the NMI watchdog timeout factor to %llu%%\n", pct); + WRITE_ONCE(wd_timeout_pct, pct); + lockup_detector_reconfigure(); +} +#endif -- cgit v1.2.3 From a2954a7e47b60bb8d8c51f1b4d55af7585a4108a Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Wed, 6 Jul 2022 12:43:04 +0200 Subject: powerpc/pci: Hide pci_device_from_OF_node() for non-powermac code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function pci_device_from_OF_node() is used only in powermac code. So hide it from all other platforms. Signed-off-by: Pali Rohár Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220706104308.5390-2-pali@kernel.org --- arch/powerpc/include/asm/pci-bridge.h | 2 ++ arch/powerpc/kernel/pci_32.c | 2 ++ arch/powerpc/kernel/pci_64.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index c85f901227c9..98156932a1f5 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -170,8 +170,10 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) return bus->sysdata; } +#ifdef CONFIG_PPC_PMAC extern int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn); +#endif #ifndef CONFIG_PPC64 extern void pci_create_OF_bus_map(void); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 5a174936c9a0..c3b91fb62a71 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -154,6 +154,7 @@ pcibios_make_OF_bus_map(void) } +#ifdef CONFIG_PPC_PMAC /* * Returns the PCI device matching a given OF node */ @@ -193,6 +194,7 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) return -ENODEV; } EXPORT_SYMBOL(pci_device_from_OF_node); +#endif /* We create the "pci-OF-bus-map" property now so it appears in the * /proc device tree diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 19b03ddf5631..0c7cfb9fab04 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -286,6 +286,7 @@ int pcibus_to_node(struct pci_bus *bus) EXPORT_SYMBOL(pcibus_to_node); #endif +#ifdef CONFIG_PPC_PMAC int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) { if (!PCI_DN(np)) @@ -294,3 +295,4 @@ int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) *devfn = PCI_DN(np)->devfn; return 0; } +#endif -- cgit v1.2.3 From 704544588735b2e1115212dbba1c210b3687ff7a Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Wed, 6 Jul 2022 12:43:06 +0200 Subject: powerpc/pci: Hide pci_create_OF_bus_map() for non-chrp code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function pci_create_OF_bus_map() is used only in chrp code. So hide it from all other platforms. Signed-off-by: Pali Rohár Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220706104308.5390-4-pali@kernel.org --- arch/powerpc/include/asm/pci-bridge.h | 2 ++ arch/powerpc/kernel/pci_32.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 98156932a1f5..e18c95f4e1d4 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -176,7 +176,9 @@ extern int pci_device_from_OF_node(struct device_node *node, #endif #ifndef CONFIG_PPC64 +#ifdef CONFIG_PPC_CHRP extern void pci_create_OF_bus_map(void); +#endif #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index df981294df29..3291af89cea4 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -194,6 +194,7 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) EXPORT_SYMBOL(pci_device_from_OF_node); #endif +#ifdef CONFIG_PPC_CHRP /* We create the "pci-OF-bus-map" property now so it appears in the * /proc device tree */ @@ -218,6 +219,7 @@ pci_create_OF_bus_map(void) of_node_put(dn); } } +#endif void pcibios_setup_phb_io_space(struct pci_controller *hose) { -- cgit v1.2.3 From fd193f85d3206cc7e7aeea2b6033d105cca38d01 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 25 May 2022 12:23:58 +1000 Subject: powerpc/64s: Remove spurious fault flushing for NMMU Commit 6d8278c414cb2 ("powerpc/64s/radix: do not flush TLB on spurious fault") removed the TLB flush for spurious faults, except when a coprocessor (nest MMU) maps the address space. This is not needed because the NMMU workaround in the PTE permission upgrade paths prevents PTEs existing with less restrictive access permissions than their corresponding TLB entries have. Remove it and replace with a comment. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220525022358.780745-4-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/tlbflush.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index d2e80f178b6d..206f920fe5b9 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -138,9 +138,29 @@ static inline void flush_all_mm(struct mm_struct *mm) static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, unsigned long address) { - /* See ptep_set_access_flags comment */ - if (atomic_read(&vma->vm_mm->context.copros) > 0) - flush_tlb_page(vma, address); + /* + * Book3S 64 does not require spurious fault flushes because the PTE + * must be re-fetched in case of an access permission problem. So the + * only reason for a spurious fault should be concurrent modification + * to the PTE, in which case the PTE will eventually be re-fetched by + * the MMU when it attempts the access again. + * + * See: Power ISA Version 3.1B, 6.10.1.2 Modifying a Translation Table + * Entry, Setting a Reference or Change Bit or Upgrading Access + * Authority (PTE Subject to Atomic Hardware Updates): + * + * "If the only change being made to a valid PTE that is subject to + * atomic hardware updates is to set the Reference or Change bit to + * 1 or to upgrade access authority, a simpler sequence suffices + * because the translation hardware will refetch the PTE if an + * access is attempted for which the only problems were reference + * and/or change bits needing to be set or insufficient access + * authority." + * + * The nest MMU in POWER9 does not perform this PTE re-fetch, but + * it avoids the spurious fault problem by flushing the TLB before + * upgrading PTE permissions, see radix__ptep_set_access_flags. + */ } extern bool tlbie_capable; -- cgit v1.2.3 From 62ccae78820b25a0ac64bb0f648388ec834fcb3c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 7 Jul 2022 16:37:18 +0200 Subject: powerpc: Remove remaining parts of oprofile Commit 9850b6c69356 ("arch: powerpc: Remove oprofile") removed oprofile. Remove all remaining parts of it. Signed-off-by: Christophe Leroy Acked-by: Viresh Kumar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/298432fe1a14c0a415760011d72c3f0999efd5e2.1657204631.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/cputable.h | 3 -- arch/powerpc/kernel/cputable.c | 67 +------------------------------ arch/powerpc/kernel/dt_cpu_ftrs.c | 4 -- arch/powerpc/platforms/cell/spufs/spufs.h | 2 +- arch/powerpc/platforms/ps3/Kconfig | 2 +- 5 files changed, 4 insertions(+), 74 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 549eb6dd146f..ae8c3e13cfce 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -70,9 +70,6 @@ struct cpu_spec { /* Used to restore cpu setup on secondary processors and at resume */ cpu_restore_t cpu_restore; - /* Used by oprofile userspace to select the right counters */ - char *oprofile_cpu_type; - /* Name of processor class, for the ELF AT_PLATFORM entry */ char *platform; diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index a5dbfccd2047..d8e42ef750f1 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -149,7 +149,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970, .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970", .platform = "ppc970", }, { /* PPC970FX */ @@ -166,7 +165,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970, .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970", .platform = "ppc970", }, { /* PPC970MP DD1.0 - no DEEPNAP, use regular 970 init */ @@ -183,7 +181,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970, .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970MP", .platform = "ppc970", }, { /* PPC970MP */ @@ -200,7 +197,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970MP, .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970MP", .platform = "ppc970", }, { /* PPC970GX */ @@ -216,7 +212,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 8, .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970", .platform = "ppc970", }, { /* Power5 GR */ @@ -230,7 +225,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power5", .platform = "power5", }, { /* Power5++ */ @@ -243,7 +237,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 6, - .oprofile_cpu_type = "ppc64/power5++", .platform = "power5+", }, { /* Power5 GS */ @@ -257,7 +250,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power5+", .platform = "power5+", }, { /* POWER6 in P5+ mode; 2.04-compliant processor */ @@ -269,7 +261,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .mmu_features = MMU_FTRS_POWER5, .icache_bsize = 128, .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", .platform = "power5+", }, { /* Power6 */ @@ -284,7 +275,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power6", .platform = "power6x", }, { /* 2.05-compliant processor, i.e. Power6 "architected" mode */ @@ -296,7 +286,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .mmu_features = MMU_FTRS_POWER6, .icache_bsize = 128, .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", .platform = "power6", }, { /* 2.06-compliant processor, i.e. Power7 "architected" mode */ @@ -309,7 +298,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .mmu_features = MMU_FTRS_POWER7, .icache_bsize = 128, .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power7, .cpu_restore = __restore_cpu_power7, .machine_check_early = __machine_check_early_realmode_p7, @@ -325,7 +313,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .mmu_features = MMU_FTRS_POWER8, .icache_bsize = 128, .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power8, .cpu_restore = __restore_cpu_power8, .machine_check_early = __machine_check_early_realmode_p8, @@ -341,7 +328,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .mmu_features = MMU_FTRS_POWER9, .icache_bsize = 128, .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power9, .cpu_restore = __restore_cpu_power9, .platform = "power9", @@ -356,7 +342,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .mmu_features = MMU_FTRS_POWER10, .icache_bsize = 128, .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power10, .cpu_restore = __restore_cpu_power10, .platform = "power10", @@ -373,7 +358,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power7", .cpu_setup = __setup_cpu_power7, .cpu_restore = __restore_cpu_power7, .machine_check_early = __machine_check_early_realmode_p7, @@ -391,7 +375,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power7", .cpu_setup = __setup_cpu_power7, .cpu_restore = __restore_cpu_power7, .machine_check_early = __machine_check_early_realmode_p7, @@ -409,7 +392,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power8", .cpu_setup = __setup_cpu_power8, .cpu_restore = __restore_cpu_power8, .machine_check_early = __machine_check_early_realmode_p8, @@ -427,7 +409,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power8", .cpu_setup = __setup_cpu_power8, .cpu_restore = __restore_cpu_power8, .machine_check_early = __machine_check_early_realmode_p8, @@ -445,7 +426,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power8", .cpu_setup = __setup_cpu_power8, .cpu_restore = __restore_cpu_power8, .machine_check_early = __machine_check_early_realmode_p8, @@ -463,7 +443,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", .cpu_setup = __setup_cpu_power9, .cpu_restore = __restore_cpu_power9, .machine_check_early = __machine_check_early_realmode_p9, @@ -481,7 +460,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", .cpu_setup = __setup_cpu_power9, .cpu_restore = __restore_cpu_power9, .machine_check_early = __machine_check_early_realmode_p9, @@ -499,7 +477,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", .cpu_setup = __setup_cpu_power9, .cpu_restore = __restore_cpu_power9, .machine_check_early = __machine_check_early_realmode_p9, @@ -517,7 +494,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", .cpu_setup = __setup_cpu_power9, .cpu_restore = __restore_cpu_power9, .machine_check_early = __machine_check_early_realmode_p9, @@ -535,7 +511,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power10", .cpu_setup = __setup_cpu_power10, .cpu_restore = __restore_cpu_power10, .machine_check_early = __machine_check_early_realmode_p10, @@ -554,7 +529,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .dcache_bsize = 128, .num_pmcs = 4, .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/cell-be", .platform = "ppc-cell-be", }, { /* PA Semi PA6T */ @@ -570,7 +544,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .pmc_type = PPC_PMC_PA6T, .cpu_setup = __setup_cpu_pa6t, .cpu_restore = __restore_cpu_pa6t, - .oprofile_cpu_type = "ppc64/pa6t", .platform = "pa6t", }, { /* default match */ @@ -734,7 +707,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_750, .machine_check = machine_check_generic, .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", }, { /* 745/755 */ .pvr_mask = 0xfffff000, @@ -765,7 +737,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_750, .machine_check = machine_check_generic, .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", }, { /* 750FX rev 2.0 must disable HID0[DPM] */ .pvr_mask = 0xffffffff, @@ -781,7 +752,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_750, .machine_check = machine_check_generic, .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", }, { /* 750FX (All revs except 2.0) */ .pvr_mask = 0xffff0000, @@ -797,7 +767,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_750fx, .machine_check = machine_check_generic, .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", }, { /* 750GX */ .pvr_mask = 0xffff0000, @@ -813,7 +782,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_750fx, .machine_check = machine_check_generic, .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", }, { /* 740/750 (L2CR bit need fixup for 740) */ .pvr_mask = 0xffff0000, @@ -891,7 +859,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -908,7 +875,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -925,7 +891,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -942,7 +907,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -959,7 +923,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -976,7 +939,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -993,7 +955,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -1010,7 +971,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -1026,7 +986,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -1043,7 +1002,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -1060,7 +1018,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .num_pmcs = 6, .pmc_type = PPC_PMC_G4, .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", .machine_check = machine_check_generic, .platform = "ppc7450", }, @@ -1172,7 +1129,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_603, .machine_check = machine_check_83xx, .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e300", .platform = "ppc603", }, { /* e300c4 (e300c1, plus one IU) */ @@ -1188,7 +1144,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_603, .machine_check = machine_check_83xx, .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e300", .platform = "ppc603", }, #endif @@ -1884,7 +1839,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500", .cpu_setup = __setup_cpu_e500v1, .machine_check = machine_check_e500, .platform = "ppc8540", @@ -1903,7 +1857,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500", .cpu_setup = __setup_cpu_e500v2, .machine_check = machine_check_e500, .platform = "ppc8548", @@ -1922,7 +1875,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .icache_bsize = 64, .dcache_bsize = 64, .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500mc", .cpu_setup = __setup_cpu_e500mc, .machine_check = machine_check_e500mc, .platform = "ppce500mc", @@ -1943,7 +1895,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .icache_bsize = 64, .dcache_bsize = 64, .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500mc", .cpu_setup = __setup_cpu_e5500, #ifndef CONFIG_PPC32 .cpu_restore = __restore_cpu_e5500, @@ -1965,7 +1916,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .icache_bsize = 64, .dcache_bsize = 64, .num_pmcs = 6, - .oprofile_cpu_type = "ppc/e6500", .cpu_setup = __setup_cpu_e6500, #ifndef CONFIG_PPC32 .cpu_restore = __restore_cpu_e6500, @@ -2033,23 +1983,10 @@ static struct cpu_spec * __init setup_cpu_spec(unsigned long offset, t->pmc_type = old.pmc_type; /* - * If we have passed through this logic once before and - * have pulled the default case because the real PVR was - * not found inside cpu_specs[], then we are possibly - * running in compatibility mode. In that case, let the - * oprofiler know which set of compatibility counters to - * pull from by making sure the oprofile_cpu_type string - * is set to that of compatibility mode. If the - * oprofile_cpu_type already has a value, then we are - * possibly overriding a real PVR with a logical one, - * and, in that case, keep the current value for - * oprofile_cpu_type. Furthermore, let's ensure that the + * Let's ensure that the * fix for the PMAO bug is enabled on compatibility mode. */ - if (old.oprofile_cpu_type != NULL) { - t->oprofile_cpu_type = old.oprofile_cpu_type; - t->cpu_features |= old.cpu_features & CPU_FTR_PMAO_BUG; - } + t->cpu_features |= old.cpu_features & CPU_FTR_PMAO_BUG; } *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec; diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 2ad365c21afa..fc800a9fb2c4 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -102,7 +102,6 @@ static struct cpu_spec __initdata base_cpu_spec = { .dcache_bsize = 32, /* cache info init. */ .num_pmcs = 0, .pmc_type = PPC_PMC_DEFAULT, - .oprofile_cpu_type = NULL, .cpu_setup = NULL, .cpu_restore = __restore_cpu_cpufeatures, .machine_check_early = NULL, @@ -387,7 +386,6 @@ static int __init feat_enable_pmu_power8(struct dt_cpu_feature *f) cur_cpu_spec->num_pmcs = 6; cur_cpu_spec->pmc_type = PPC_PMC_IBM; - cur_cpu_spec->oprofile_cpu_type = "ppc64/power8"; return 1; } @@ -423,7 +421,6 @@ static int __init feat_enable_pmu_power9(struct dt_cpu_feature *f) cur_cpu_spec->num_pmcs = 6; cur_cpu_spec->pmc_type = PPC_PMC_IBM; - cur_cpu_spec->oprofile_cpu_type = "ppc64/power9"; return 1; } @@ -449,7 +446,6 @@ static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f) cur_cpu_spec->num_pmcs = 6; cur_cpu_spec->pmc_type = PPC_PMC_IBM; - cur_cpu_spec->oprofile_cpu_type = "ppc64/power10"; return 1; } diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h index afc1d6604d12..23c6799cfa5a 100644 --- a/arch/powerpc/platforms/cell/spufs/spufs.h +++ b/arch/powerpc/platforms/cell/spufs/spufs.h @@ -76,7 +76,7 @@ struct spu_context { struct address_space *mss; /* 'mss' area mappings. */ struct address_space *psmap; /* 'psmap' area mappings. */ struct mutex mapping_lock; - u64 object_id; /* user space pointer for oprofile */ + u64 object_id; /* user space pointer for GNU Debugger */ enum { SPU_STATE_RUNNABLE, SPU_STATE_SAVED } state; struct mutex state_mutex; diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig index 610682caabc4..a44869e5ea70 100644 --- a/arch/powerpc/platforms/ps3/Kconfig +++ b/arch/powerpc/platforms/ps3/Kconfig @@ -165,7 +165,7 @@ config PS3_LPM If you intend to use the advanced performance monitoring and profiling support of the Cell processor with programs like - oprofile and perfmon2, then say Y or M, otherwise say N. + perfmon2, then say Y or M, otherwise say N. config PS3GELIC_UDBG bool "PS3 udbg output via UDP broadcasts on Ethernet" -- cgit v1.2.3 From 7b48377e1d9f68a1b58dfd22d40b083f38ce76a0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 7 Jul 2022 16:55:14 +0200 Subject: powerpc/probes: Remove ppc_opcode_t ppc_opcode_t is just an u32. There is no point in hiding u32 behind such a typedef. Remove it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b2d762191b095530789ac8b71b167c6740bb6aed.1657205708.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/kprobes.h | 2 +- arch/powerpc/include/asm/probes.h | 1 - arch/powerpc/include/asm/uprobes.h | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index bab364152b29..c8e4b4fd4e33 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -29,7 +29,7 @@ struct pt_regs; struct kprobe; -typedef ppc_opcode_t kprobe_opcode_t; +typedef u32 kprobe_opcode_t; extern kprobe_opcode_t optinsn_slot; diff --git a/arch/powerpc/include/asm/probes.h b/arch/powerpc/include/asm/probes.h index 6f66e358aa37..00634e3145e7 100644 --- a/arch/powerpc/include/asm/probes.h +++ b/arch/powerpc/include/asm/probes.h @@ -10,7 +10,6 @@ #include #include -typedef u32 ppc_opcode_t; #define BREAKPOINT_INSTRUCTION 0x7fe00008 /* trap */ /* Trap definitions per ISA */ diff --git a/arch/powerpc/include/asm/uprobes.h b/arch/powerpc/include/asm/uprobes.h index a7ae1860115a..4fea116d3d37 100644 --- a/arch/powerpc/include/asm/uprobes.h +++ b/arch/powerpc/include/asm/uprobes.h @@ -12,7 +12,7 @@ #include #include -typedef ppc_opcode_t uprobe_opcode_t; +typedef u32 uprobe_opcode_t; #define MAX_UINSN_BYTES 8 #define UPROBE_XOL_SLOT_BYTES (MAX_UINSN_BYTES) -- cgit v1.2.3 From d00d762daf1278641eec92d74011a7e625e84a68 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 7 Jul 2022 16:55:15 +0200 Subject: powerpc/ppc-opcode: Define and use PPC_RAW_TRAP() and PPC_RAW_TW() Add and use PPC_RAW_TRAP() instead of opencoding. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/52c7e522e56a38e3ff0363906919445920005a8f.1657205708.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 2 ++ arch/powerpc/include/asm/probes.h | 3 ++- arch/powerpc/xmon/xmon.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 89beabf5325c..5527a955fb4a 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -581,6 +581,8 @@ #define PPC_RAW_BRANCH(offset) (0x48000000 | PPC_LI(offset)) #define PPC_RAW_BL(offset) (0x48000001 | PPC_LI(offset)) +#define PPC_RAW_TW(t0, a, b) (0x7f000008 | ___PPC_RS(t0) | ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_TRAP() PPC_RAW_TW(31, 0, 0) /* Deal with instructions that older assemblers aren't aware of */ #define PPC_BCCTR_FLUSH stringify_in_c(.long PPC_INST_BCCTR_FLUSH) diff --git a/arch/powerpc/include/asm/probes.h b/arch/powerpc/include/asm/probes.h index 00634e3145e7..e77a2ed7d938 100644 --- a/arch/powerpc/include/asm/probes.h +++ b/arch/powerpc/include/asm/probes.h @@ -9,8 +9,9 @@ */ #include #include +#include -#define BREAKPOINT_INSTRUCTION 0x7fe00008 /* trap */ +#define BREAKPOINT_INSTRUCTION PPC_RAW_TRAP() /* trap */ /* Trap definitions per ISA */ #define IS_TW(instr) (((instr) & 0xfc0007fe) == 0x7c000008) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index f80c714f1d49..26ef3388c24c 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -116,7 +116,7 @@ struct bpt { static struct bpt bpts[NBPTS]; static struct bpt dabr[HBP_NUM_MAX]; static struct bpt *iabr; -static unsigned bpinstr = 0x7fe00008; /* trap */ +static unsigned int bpinstr = PPC_RAW_TRAP(); #define BP_NUM(bp) ((bp) - bpts + 1) -- cgit v1.2.3 From de40303b54bc458d7df0d4b4ee1d296df7fe98c7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 7 Jul 2022 16:55:16 +0200 Subject: powerpc/ppc-opcode: Define and use PPC_RAW_SETB() We have PPC_INST_SETB then build the 'setb' instruction in the user. Instead, define PPC_RAW_SETB() and use it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b08a4f26919a8f8cdcf7544ab552d9c1c63418b5.1657205708.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 2 +- arch/powerpc/lib/test_emulate_step.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 5527a955fb4a..7b81b37a191e 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -290,7 +290,6 @@ #define PPC_INST_STRING 0x7c00042a #define PPC_INST_STRING_MASK 0xfc0007fe #define PPC_INST_STRING_GEN_MASK 0xfc00067e -#define PPC_INST_SETB 0x7c000100 #define PPC_INST_STSWI 0x7c0005aa #define PPC_INST_STSWX 0x7c00052a #define PPC_INST_TRECHKPT 0x7c0007dd @@ -583,6 +582,7 @@ #define PPC_RAW_BL(offset) (0x48000001 | PPC_LI(offset)) #define PPC_RAW_TW(t0, a, b) (0x7f000008 | ___PPC_RS(t0) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_TRAP() PPC_RAW_TW(31, 0, 0) +#define PPC_RAW_SETB(t, bfa) (0x7c000100 | ___PPC_RT(t) | ___PPC_RA((bfa) << 2)) /* Deal with instructions that older assemblers aren't aware of */ #define PPC_BCCTR_FLUSH stringify_in_c(.long PPC_INST_BCCTR_FLUSH) diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index f2e47be05e8c..23c7805fb7b3 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -53,9 +53,6 @@ ppc_inst_prefix(PPC_PREFIX_MLS | __PPC_PRFX_R(pr) | IMM_H(i), \ PPC_RAW_ADDI(t, a, i)) -#define TEST_SETB(t, bfa) ppc_inst(PPC_INST_SETB | ___PPC_RT(t) | ___PPC_RA((bfa & 0x7) << 2)) - - static void __init init_pt_regs(struct pt_regs *regs) { static unsigned long msr; @@ -935,21 +932,21 @@ static struct compute_test compute_tests[] = { .subtests = { { .descr = "BFA = 1, CR = GT", - .instr = TEST_SETB(20, 1), + .instr = ppc_inst(PPC_RAW_SETB(20, 1)), .regs = { .ccr = 0x4000000, } }, { .descr = "BFA = 4, CR = LT", - .instr = TEST_SETB(20, 4), + .instr = ppc_inst(PPC_RAW_SETB(20, 4)), .regs = { .ccr = 0x8000, } }, { .descr = "BFA = 5, CR = EQ", - .instr = TEST_SETB(20, 5), + .instr = ppc_inst(PPC_RAW_SETB(20, 5)), .regs = { .ccr = 0x200, } -- cgit v1.2.3 From 2255411d1d0f0661d1e5acd5f6edf4e6652a345a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Jul 2022 16:19:32 +0200 Subject: powerpc/44x: Fix build failure with GCC 12 (unrecognized opcode: `wrteei') Building ppc40x_defconfig leads to following error CC arch/powerpc/kernel/idle.o {standard input}: Assembler messages: {standard input}:67: Error: unrecognized opcode: `wrteei' {standard input}:78: Error: unrecognized opcode: `wrteei' Add -mcpu=440 by default and alternatively 464 and 476. Once that's done, -mcpu=powerpc is only for book3s/32 now. But then comes CC arch/powerpc/kernel/io.o {standard input}: Assembler messages: {standard input}:198: Error: unrecognized opcode: `eieio' {standard input}:230: Error: unrecognized opcode: `eieio' {standard input}:245: Error: unrecognized opcode: `eieio' {standard input}:254: Error: unrecognized opcode: `eieio' {standard input}:273: Error: unrecognized opcode: `eieio' {standard input}:396: Error: unrecognized opcode: `eieio' {standard input}:404: Error: unrecognized opcode: `eieio' {standard input}:423: Error: unrecognized opcode: `eieio' {standard input}:512: Error: unrecognized opcode: `eieio' {standard input}:520: Error: unrecognized opcode: `eieio' {standard input}:539: Error: unrecognized opcode: `eieio' {standard input}:628: Error: unrecognized opcode: `eieio' {standard input}:636: Error: unrecognized opcode: `eieio' {standard input}:655: Error: unrecognized opcode: `eieio' Fix it by replacing eieio by mbar on booke. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b0d982e223314ed82ab959f5d4ad2c4c00bedb99.1657549153.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/barrier.h | 2 ++ arch/powerpc/include/asm/nohash/pgtable.h | 2 +- arch/powerpc/include/asm/synch.h | 5 ++++- arch/powerpc/mm/nohash/tlb_low.S | 4 ++-- arch/powerpc/platforms/Kconfig.cputype | 17 ++++++++++++++++- arch/powerpc/sysdev/fsl_rio.c | 12 ++++++++++-- 6 files changed, 35 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index f0e687236484..ef2d8b15eaab 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -42,6 +42,8 @@ /* The sub-arch has lwsync */ #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC) # define SMPWMB LWSYNC +#elif defined(CONFIG_BOOKE) +# define SMPWMB mbar #else # define SMPWMB eieio #endif diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index ac75f4ab0dba..b499da6c1a99 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -193,7 +193,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, if (IS_ENABLED(CONFIG_PPC32) && IS_ENABLED(CONFIG_PTE_64BIT) && !percpu) { __asm__ __volatile__("\ stw%X0 %2,%0\n\ - eieio\n\ + mbar\n\ stw%X1 %L2,%1" : "=m" (*ptep), "=m" (*((unsigned char *)ptep+4)) : "r" (pte) : "memory"); diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index 1d67bc8d7bc6..7130176d8cb8 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -14,7 +14,10 @@ extern void do_lwsync_fixups(unsigned long value, void *fixup_start, static inline void eieio(void) { - __asm__ __volatile__ ("eieio" : : : "memory"); + if (IS_ENABLED(CONFIG_BOOKE)) + __asm__ __volatile__ ("mbar" : : : "memory"); + else + __asm__ __volatile__ ("eieio" : : : "memory"); } static inline void isync(void) diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index dd39074de9af..d62b613a0d5d 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -186,7 +186,7 @@ _GLOBAL(_tlbivax_bcast) isync PPC_TLBIVAX(0, R3) isync - eieio + mbar tlbsync BEGIN_FTR_SECTION b 1f @@ -355,7 +355,7 @@ _GLOBAL(_tlbivax_bcast) rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND 1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ PPC_TLBIVAX(0,R3) - eieio + mbar tlbsync sync wrtee r10 diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index e14000557ebd..3cc8452b8660 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -137,7 +137,7 @@ config GENERIC_CPU config POWERPC_CPU bool "Generic 32 bits powerpc" - depends on PPC32 && !PPC_8xx && !PPC_85xx && !40x + depends on PPC_BOOK3S_32 config CELL_CPU bool "Cell Broadband Engine" @@ -183,6 +183,18 @@ config 405_CPU bool "40x family" depends on 40x +config 440_CPU + bool "440 (44x family)" + depends on 44x + +config 464_CPU + bool "464 (44x family)" + depends on 44x + +config 476_CPU + bool "476 (47x family)" + depends on PPC_47x + config 860_CPU bool "8xx family" depends on PPC_8xx @@ -228,6 +240,9 @@ config TARGET_CPU default "power8" if POWER8_CPU default "power9" if POWER9_CPU default "405" if 405_CPU + default "440" if 440_CPU + default "464" if 464_CPU + default "476" if 476_CPU default "860" if 860_CPU default "e300c2" if E300C2_CPU default "e300c3" if E300C3_CPU diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 1bfc9afa8a1a..4647c6074f3b 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -69,10 +69,10 @@ static DEFINE_SPINLOCK(fsl_rio_config_lock); -#define __fsl_read_rio_config(x, addr, err, op) \ +#define ___fsl_read_rio_config(x, addr, err, op, barrier) \ __asm__ __volatile__( \ "1: "op" %1,0(%2)\n" \ - " eieio\n" \ + " "barrier"\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: li %1,-1\n" \ @@ -83,6 +83,14 @@ static DEFINE_SPINLOCK(fsl_rio_config_lock); : "=r" (err), "=r" (x) \ : "b" (addr), "i" (-EFAULT), "0" (err)) +#ifdef CONFIG_BOOKE +#define __fsl_read_rio_config(x, addr, err, op) \ + ___fsl_read_rio_config(x, addr, err, op, "mbar") +#else +#define __fsl_read_rio_config(x, addr, err, op) \ + ___fsl_read_rio_config(x, addr, err, op, "eieio") +#endif + void __iomem *rio_regs_win; void __iomem *rmu_regs_win; resource_size_t rio_law_start; -- cgit v1.2.3 From 4d5c5bad51935482437528f7fa4dffdcb3330d8b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 8 Jul 2022 09:11:07 +0200 Subject: powerpc: Remove asm/prom.h from asm/mpc52xx.h and asm/pci.h asm/pci.h and asm/mpc52xx.h don't need asm/prom.h Declare struct device_node locally to avoid including of.h Signed-off-by: Christophe Leroy [mpe: Add missing include of prom.h to of_rtc.c] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cf5243343e2364c2b40f22ee5ad9a6e2453d1121.1657264228.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/mpc52xx.h | 3 ++- arch/powerpc/include/asm/pci.h | 1 - arch/powerpc/kernel/prom.c | 1 + arch/powerpc/sysdev/of_rtc.c | 2 ++ 4 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h index ddd80aae1e32..5ea16a71c2f0 100644 --- a/arch/powerpc/include/asm/mpc52xx.h +++ b/arch/powerpc/include/asm/mpc52xx.h @@ -15,7 +15,6 @@ #ifndef __ASSEMBLY__ #include -#include #include #endif /* __ASSEMBLY__ */ @@ -268,6 +267,8 @@ struct mpc52xx_intr { #ifndef __ASSEMBLY__ +struct device_node; + /* mpc52xx_common.c */ extern void mpc5200_setup_xlb_arbiter(void); extern void mpc52xx_declare_of_platform_devices(void); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 915d6ee4b40a..0f182074cdb7 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -14,7 +14,6 @@ #include #include -#include #include /* Return values for pci_controller_ops.probe_mode function */ diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 1066b072db35..0a68c99da573 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -54,6 +54,7 @@ #include #include #include +#include #include diff --git a/arch/powerpc/sysdev/of_rtc.c b/arch/powerpc/sysdev/of_rtc.c index 1f408d34a6a7..420f949b7485 100644 --- a/arch/powerpc/sysdev/of_rtc.c +++ b/arch/powerpc/sysdev/of_rtc.c @@ -11,6 +11,8 @@ #include #include +#include + static __initdata struct { const char *compatible; char *plat_name; -- cgit v1.2.3 From 36afe68714d45edf34430d28e3dc787425ad8b22 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 8 Jul 2022 09:11:08 +0200 Subject: powerpc: Finally remove unnecessary headers from asm/prom.h Remove all headers included from asm/prom.h which are not used by asm/prom.h itself. Declare struct device_node and struct property locally to avoid including of.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4be954abef978b34cff9193fc566ffefdd3517bb.1657264228.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/prom.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 6f109b5cb84e..2e82820fbd64 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -12,16 +12,10 @@ * Updates for PPC64 by Peter Bergner & David Engebretsen, IBM Corp. */ #include -#include -#include #include -/* These includes should be removed once implicit includes are cleaned up. */ -#include -#include -#include -#include -#include +struct device_node; +struct property; #define OF_DT_BEGIN_NODE 0x1 /* Start of node, full name */ #define OF_DT_END_NODE 0x2 /* End node */ -- cgit v1.2.3 From c7255058b5430b5c42932383bd8887d591e7973a Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 30 Jun 2022 12:19:42 +0530 Subject: powerpc/crash: save cpu register data in crash_smp_send_stop() During kdump, two set of NMI IPIs are sent to secondary CPUs, if 'crash_kexec_post_notifiers' option is set. The first set of NMI IPIs to stop the CPUs and the other set to collect register data. Instead, capture register data for secondary CPUs while stopping them itself. Also, fallback to smp_send_stop() in case the function gets called without kdump configured. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220630064942.192283-1-hbathini@linux.ibm.com --- arch/powerpc/include/asm/kexec.h | 1 + arch/powerpc/kernel/smp.c | 29 +++++---------- arch/powerpc/kexec/crash.c | 77 ++++++++++++++++++++++++---------------- 3 files changed, 57 insertions(+), 50 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 2aefe14e1442..cce69101205e 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -83,6 +83,7 @@ extern void default_machine_crash_shutdown(struct pt_regs *regs); extern int crash_shutdown_register(crash_shutdown_t handler); extern int crash_shutdown_unregister(crash_shutdown_t handler); +extern void crash_kexec_prepare(void); extern void crash_kexec_secondary(struct pt_regs *regs); int __init overlaps_crashkernel(unsigned long start, unsigned long size); extern void reserve_crashkernel(void); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index bcefab484ea6..6b850c157a62 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -55,7 +56,6 @@ #endif #include #include -#include #include #include #include @@ -619,20 +619,6 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) } #endif -#ifdef CONFIG_NMI_IPI -static void crash_stop_this_cpu(struct pt_regs *regs) -#else -static void crash_stop_this_cpu(void *dummy) -#endif -{ - /* - * Just busy wait here and avoid marking CPU as offline to ensure - * register data is captured appropriately. - */ - while (1) - cpu_relax(); -} - void crash_smp_send_stop(void) { static bool stopped = false; @@ -651,11 +637,14 @@ void crash_smp_send_stop(void) stopped = true; -#ifdef CONFIG_NMI_IPI - smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_stop_this_cpu, 1000000); -#else - smp_call_function(crash_stop_this_cpu, NULL, 0); -#endif /* CONFIG_NMI_IPI */ +#ifdef CONFIG_KEXEC_CORE + if (kexec_crash_image) { + crash_kexec_prepare(); + return; + } +#endif + + smp_send_stop(); } #ifdef CONFIG_NMI_IPI diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index 80f54723cf6d..252724ed666a 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -40,6 +40,14 @@ #define REAL_MODE_TIMEOUT 10000 static int time_to_dump; + +/* + * In case of system reset, secondary CPUs enter crash_kexec_secondary with out + * having to send an IPI explicitly. So, indicate if the crash is via + * system reset to avoid sending another IPI. + */ +static int is_via_system_reset; + /* * crash_wake_offline should be set to 1 by platforms that intend to wake * up offline cpus prior to jumping to a kdump kernel. Currently powernv @@ -101,7 +109,7 @@ void crash_ipi_callback(struct pt_regs *regs) /* NOTREACHED */ } -static void crash_kexec_prepare_cpus(int cpu) +static void crash_kexec_prepare_cpus(void) { unsigned int msecs; volatile unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ @@ -113,7 +121,15 @@ static void crash_kexec_prepare_cpus(int cpu) if (crash_wake_offline) ncpus = num_present_cpus() - 1; - crash_send_ipi(crash_ipi_callback); + /* + * If we came in via system reset, secondaries enter via crash_kexec_secondary(). + * So, wait a while for the secondary CPUs to enter for that case. + * Else, send IPI to all other CPUs. + */ + if (is_via_system_reset) + mdelay(PRIMARY_TIMEOUT); + else + crash_send_ipi(crash_ipi_callback); smp_wmb(); again: @@ -202,7 +218,7 @@ void crash_kexec_secondary(struct pt_regs *regs) #else /* ! CONFIG_SMP */ -static void crash_kexec_prepare_cpus(int cpu) +static void crash_kexec_prepare_cpus(void) { /* * move the secondaries to us so that we can copy @@ -248,6 +264,32 @@ noinstr static void __maybe_unused crash_kexec_wait_realmode(int cpu) static inline void crash_kexec_wait_realmode(int cpu) {} #endif /* CONFIG_SMP && CONFIG_PPC64 */ +void crash_kexec_prepare(void) +{ + /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ + printk_deferred_enter(); + + /* + * This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means stopping other cpus in + * an SMP system. + * The kernel is broken so disable interrupts. + */ + hard_irq_disable(); + + /* + * Make a note of crashing cpu. Will be used in machine_kexec + * such that another IPI will not be sent. + */ + crashing_cpu = smp_processor_id(); + + crash_kexec_prepare_cpus(); +} + /* * Register a function to be called on shutdown. Only use this if you * can't reset your device in the second kernel. @@ -311,35 +353,10 @@ void default_machine_crash_shutdown(struct pt_regs *regs) unsigned int i; int (*old_handler)(struct pt_regs *regs); - /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ - printk_deferred_enter(); - - /* - * This function is only called after the system - * has panicked or is otherwise in a critical state. - * The minimum amount of code to allow a kexec'd kernel - * to run successfully needs to happen here. - * - * In practice this means stopping other cpus in - * an SMP system. - * The kernel is broken so disable interrupts. - */ - hard_irq_disable(); - - /* - * Make a note of crashing cpu. Will be used in machine_kexec - * such that another IPI will not be sent. - */ - crashing_cpu = smp_processor_id(); - - /* - * If we came in via system reset, wait a while for the secondary - * CPUs to enter. - */ if (TRAP(regs) == INTERRUPT_SYSTEM_RESET) - mdelay(PRIMARY_TIMEOUT); + is_via_system_reset = 1; - crash_kexec_prepare_cpus(crashing_cpu); + crash_smp_send_stop(); crash_save_cpu(regs, crashing_cpu); -- cgit v1.2.3 From 2454a7af0f2a42918aa972147a0bec38e6656cd8 Mon Sep 17 00:00:00 2001 From: Nayna Jain Date: Sat, 23 Jul 2022 07:30:46 -0400 Subject: powerpc/pseries: define driver for Platform KeyStore PowerVM provides an isolated Platform Keystore(PKS) storage allocation for each LPAR with individually managed access controls to store sensitive information securely. It provides a new set of hypervisor calls for Linux kernel to access PKS storage. Define POWER LPAR Platform KeyStore(PLPKS) driver using H_CALL interface to access PKS storage. Signed-off-by: Nayna Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220723113048.521744-2-nayna@linux.ibm.com --- arch/powerpc/include/asm/hvcall.h | 11 + arch/powerpc/platforms/pseries/Kconfig | 13 + arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/plpks.c | 460 ++++++++++++++++++++++++++++++++ arch/powerpc/platforms/pseries/plpks.h | 71 +++++ 5 files changed, 556 insertions(+) create mode 100644 arch/powerpc/platforms/pseries/plpks.c create mode 100644 arch/powerpc/platforms/pseries/plpks.h (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 4457abe31e6e..8abae463f6c1 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -79,6 +79,7 @@ #define H_NOT_ENOUGH_RESOURCES -44 #define H_R_STATE -45 #define H_RESCINDED -46 +#define H_P1 -54 #define H_P2 -55 #define H_P3 -56 #define H_P4 -57 @@ -98,6 +99,8 @@ #define H_OP_MODE -73 #define H_COP_HW -74 #define H_STATE -75 +#define H_IN_USE -77 +#define H_ABORTED -78 #define H_UNSUPPORTED_FLAG_START -256 #define H_UNSUPPORTED_FLAG_END -511 #define H_MULTI_THREADS_ACTIVE -9005 @@ -322,6 +325,14 @@ #define H_SCM_UNBIND_ALL 0x3FC #define H_SCM_HEALTH 0x400 #define H_SCM_PERFORMANCE_STATS 0x418 +#define H_PKS_GET_CONFIG 0x41C +#define H_PKS_SET_PASSWORD 0x420 +#define H_PKS_GEN_PASSWORD 0x424 +#define H_PKS_WRITE_OBJECT 0x42C +#define H_PKS_GEN_KEY 0x430 +#define H_PKS_READ_OBJECT 0x434 +#define H_PKS_REMOVE_OBJECT 0x438 +#define H_PKS_CONFIRM_OBJECT_FLUSHED 0x43C #define H_RPT_INVALIDATE 0x448 #define H_SCM_FLUSH 0x44C #define H_GET_ENERGY_SCALE_INFO 0x450 diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f7fd91d153a4..c4a6d4083a7a 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -142,6 +142,19 @@ config IBMEBUS help Bus device driver for GX bus based adapters. +config PSERIES_PLPKS + depends on PPC_PSERIES + bool "Support for the Platform Key Storage" + help + PowerVM provides an isolated Platform Keystore(PKS) storage + allocation for each LPAR with individually managed access + controls to store sensitive information securely. It can be + used to store asymmetric public keys or secrets as required + by different usecases. Select this config to enable + operating system interface to hypervisor to access this space. + + If unsure, select N. + config PAPR_SCM depends on PPC_PSERIES && MEMORY_HOTPLUG && LIBNVDIMM tristate "Support for the PAPR Storage Class Memory interface" diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 7aaff5323544..14e143b946a3 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_SVM) += svm.o obj-$(CONFIG_FA_DUMP) += rtas-fadump.o +obj-$(CONFIG_PSERIES_PLPKS) += plpks.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PPC_VAS) += vas.o vas-sysfs.o diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c new file mode 100644 index 000000000000..52aaa2894606 --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * POWER LPAR Platform KeyStore(PLPKS) + * Copyright (C) 2022 IBM Corporation + * Author: Nayna Jain + * + * Provides access to variables stored in Power LPAR Platform KeyStore(PLPKS). + */ + +#define pr_fmt(fmt) "plpks: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "plpks.h" + +#define PKS_FW_OWNER 0x1 +#define PKS_BOOTLOADER_OWNER 0x2 +#define PKS_OS_OWNER 0x3 + +#define LABEL_VERSION 0 +#define MAX_LABEL_ATTR_SIZE 16 +#define MAX_NAME_SIZE 239 +#define MAX_DATA_SIZE 4000 + +#define PKS_FLUSH_MAX_TIMEOUT 5000 //msec +#define PKS_FLUSH_SLEEP 10 //msec +#define PKS_FLUSH_SLEEP_RANGE 400 + +static u8 *ospassword; +static u16 ospasswordlength; + +// Retrieved with H_PKS_GET_CONFIG +static u16 maxpwsize; +static u16 maxobjsize; + +struct plpks_auth { + u8 version; + u8 consumer; + __be64 rsvd0; + __be32 rsvd1; + __be16 passwordlength; + u8 password[]; +} __packed __aligned(16); + +struct label_attr { + u8 prefix[8]; + u8 version; + u8 os; + u8 length; + u8 reserved[5]; +}; + +struct label { + struct label_attr attr; + u8 name[MAX_NAME_SIZE]; + size_t size; +}; + +static int pseries_status_to_err(int rc) +{ + int err; + + switch (rc) { + case H_SUCCESS: + err = 0; + break; + case H_FUNCTION: + err = -ENXIO; + break; + case H_P1: + case H_P2: + case H_P3: + case H_P4: + case H_P5: + case H_P6: + err = -EINVAL; + break; + case H_NOT_FOUND: + err = -ENOENT; + break; + case H_BUSY: + err = -EBUSY; + break; + case H_AUTHORITY: + err = -EPERM; + break; + case H_NO_MEM: + err = -ENOMEM; + break; + case H_RESOURCE: + err = -EEXIST; + break; + case H_TOO_BIG: + err = -EFBIG; + break; + case H_STATE: + err = -EIO; + break; + case H_R_STATE: + err = -EIO; + break; + case H_IN_USE: + err = -EEXIST; + break; + case H_ABORTED: + err = -EINTR; + break; + default: + err = -EINVAL; + } + + return err; +} + +static int plpks_gen_password(void) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + u8 *password, consumer = PKS_OS_OWNER; + int rc; + + password = kzalloc(maxpwsize, GFP_KERNEL); + if (!password) + return -ENOMEM; + + rc = plpar_hcall(H_PKS_GEN_PASSWORD, retbuf, consumer, 0, + virt_to_phys(password), maxpwsize); + + if (!rc) { + ospasswordlength = maxpwsize; + ospassword = kzalloc(maxpwsize, GFP_KERNEL); + if (!ospassword) { + kfree(password); + return -ENOMEM; + } + memcpy(ospassword, password, ospasswordlength); + } else { + if (rc == H_IN_USE) { + pr_warn("Password is already set for POWER LPAR Platform KeyStore\n"); + rc = 0; + } else { + goto out; + } + } +out: + kfree(password); + + return pseries_status_to_err(rc); +} + +static struct plpks_auth *construct_auth(u8 consumer) +{ + struct plpks_auth *auth; + + if (consumer > PKS_OS_OWNER) + return ERR_PTR(-EINVAL); + + auth = kmalloc(struct_size(auth, password, maxpwsize), GFP_KERNEL); + if (!auth) + return ERR_PTR(-ENOMEM); + + auth->version = 1; + auth->consumer = consumer; + auth->rsvd0 = 0; + auth->rsvd1 = 0; + + if (consumer == PKS_FW_OWNER || consumer == PKS_BOOTLOADER_OWNER) { + auth->passwordlength = 0; + return auth; + } + + memcpy(auth->password, ospassword, ospasswordlength); + + auth->passwordlength = cpu_to_be16(ospasswordlength); + + return auth; +} + +/** + * Label is combination of label attributes + name. + * Label attributes are used internally by kernel and not exposed to the user. + */ +static struct label *construct_label(char *component, u8 varos, u8 *name, + u16 namelen) +{ + struct label *label; + size_t slen; + + if (!name || namelen > MAX_NAME_SIZE) + return ERR_PTR(-EINVAL); + + slen = strlen(component); + if (component && slen > sizeof(label->attr.prefix)) + return ERR_PTR(-EINVAL); + + label = kzalloc(sizeof(*label), GFP_KERNEL); + if (!label) + return ERR_PTR(-ENOMEM); + + if (component) + memcpy(&label->attr.prefix, component, slen); + + label->attr.version = LABEL_VERSION; + label->attr.os = varos; + label->attr.length = MAX_LABEL_ATTR_SIZE; + memcpy(&label->name, name, namelen); + + label->size = sizeof(struct label_attr) + namelen; + + return label; +} + +static int _plpks_get_config(void) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct { + u8 version; + u8 flags; + __be32 rsvd0; + __be16 maxpwsize; + __be16 maxobjlabelsize; + __be16 maxobjsize; + __be32 totalsize; + __be32 usedspace; + __be32 supportedpolicies; + __be64 rsvd1; + } __packed config; + size_t size; + int rc; + + size = sizeof(config); + + rc = plpar_hcall(H_PKS_GET_CONFIG, retbuf, virt_to_phys(&config), size); + + if (rc != H_SUCCESS) + return pseries_status_to_err(rc); + + maxpwsize = be16_to_cpu(config.maxpwsize); + maxobjsize = be16_to_cpu(config.maxobjsize); + + return 0; +} + +static int plpks_confirm_object_flushed(struct label *label, + struct plpks_auth *auth) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + u64 timeout = 0; + u8 status; + int rc; + + do { + rc = plpar_hcall(H_PKS_CONFIRM_OBJECT_FLUSHED, retbuf, + virt_to_phys(auth), virt_to_phys(label), + label->size); + + status = retbuf[0]; + if (rc) { + if (rc == H_NOT_FOUND && status == 1) + rc = 0; + break; + } + + if (!rc && status == 1) + break; + + usleep_range(PKS_FLUSH_SLEEP, + PKS_FLUSH_SLEEP + PKS_FLUSH_SLEEP_RANGE); + timeout = timeout + PKS_FLUSH_SLEEP; + } while (timeout < PKS_FLUSH_MAX_TIMEOUT); + + rc = pseries_status_to_err(rc); + + return rc; +} + +int plpks_write_var(struct plpks_var var) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct plpks_auth *auth; + struct label *label; + int rc; + + if (!var.component || !var.data || var.datalen <= 0 || + var.namelen > MAX_NAME_SIZE || var.datalen > MAX_DATA_SIZE) + return -EINVAL; + + if (var.policy & SIGNEDUPDATE) + return -EINVAL; + + auth = construct_auth(PKS_OS_OWNER); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + label = construct_label(var.component, var.os, var.name, var.namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out; + } + + rc = plpar_hcall(H_PKS_WRITE_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(label), label->size, var.policy, + virt_to_phys(var.data), var.datalen); + + if (!rc) + rc = plpks_confirm_object_flushed(label, auth); + + if (rc) + pr_err("Failed to write variable %s for component %s with error %d\n", + var.name, var.component, rc); + + rc = pseries_status_to_err(rc); + kfree(label); +out: + kfree(auth); + + return rc; +} + +int plpks_remove_var(char *component, u8 varos, struct plpks_var_name vname) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct plpks_auth *auth; + struct label *label; + int rc; + + if (!component || vname.namelen > MAX_NAME_SIZE) + return -EINVAL; + + auth = construct_auth(PKS_OS_OWNER); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + label = construct_label(component, varos, vname.name, vname.namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out; + } + + rc = plpar_hcall(H_PKS_REMOVE_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(label), label->size); + + if (!rc) + rc = plpks_confirm_object_flushed(label, auth); + + if (rc) + pr_err("Failed to remove variable %s for component %s with error %d\n", + vname.name, component, rc); + + rc = pseries_status_to_err(rc); + kfree(label); +out: + kfree(auth); + + return rc; +} + +static int plpks_read_var(u8 consumer, struct plpks_var *var) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct plpks_auth *auth; + struct label *label; + u8 *output; + int rc; + + if (var->namelen > MAX_NAME_SIZE) + return -EINVAL; + + auth = construct_auth(PKS_OS_OWNER); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + label = construct_label(var->component, var->os, var->name, + var->namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out_free_auth; + } + + output = kzalloc(maxobjsize, GFP_KERNEL); + if (!output) { + rc = -ENOMEM; + goto out_free_label; + } + + rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(label), label->size, virt_to_phys(output), + maxobjsize); + + if (rc != H_SUCCESS) { + pr_err("Failed to read variable %s for component %s with error %d\n", + var->name, var->component, rc); + rc = pseries_status_to_err(rc); + goto out_free_output; + } + + if (var->datalen == 0 || var->datalen > retbuf[0]) + var->datalen = retbuf[0]; + + var->data = kzalloc(var->datalen, GFP_KERNEL); + if (!var->data) { + rc = -ENOMEM; + goto out_free_output; + } + var->policy = retbuf[1]; + + memcpy(var->data, output, var->datalen); + rc = 0; + +out_free_output: + kfree(output); +out_free_label: + kfree(label); +out_free_auth: + kfree(auth); + + return rc; +} + +int plpks_read_os_var(struct plpks_var *var) +{ + return plpks_read_var(PKS_OS_OWNER, var); +} + +int plpks_read_fw_var(struct plpks_var *var) +{ + return plpks_read_var(PKS_FW_OWNER, var); +} + +int plpks_read_bootloader_var(struct plpks_var *var) +{ + return plpks_read_var(PKS_BOOTLOADER_OWNER, var); +} + +static __init int pseries_plpks_init(void) +{ + int rc; + + rc = _plpks_get_config(); + + if (rc) { + pr_err("POWER LPAR Platform KeyStore is not supported or enabled\n"); + return rc; + } + + rc = plpks_gen_password(); + if (rc) + pr_err("Failed setting POWER LPAR Platform KeyStore Password\n"); + else + pr_info("POWER LPAR Platform KeyStore initialized successfully\n"); + + return rc; +} +arch_initcall(pseries_plpks_init); diff --git a/arch/powerpc/platforms/pseries/plpks.h b/arch/powerpc/platforms/pseries/plpks.h new file mode 100644 index 000000000000..c6a291367bb1 --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpks.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 IBM Corporation + * Author: Nayna Jain + * + * Platform keystore for pseries LPAR(PLPKS). + */ + +#ifndef _PSERIES_PLPKS_H +#define _PSERIES_PLPKS_H + +#include +#include + +#define OSSECBOOTAUDIT 0x40000000 +#define OSSECBOOTENFORCE 0x20000000 +#define WORLDREADABLE 0x08000000 +#define SIGNEDUPDATE 0x01000000 + +#define PLPKS_VAR_LINUX 0x01 +#define PLPKS_VAR_COMMON 0x04 + +struct plpks_var { + char *component; + u8 *name; + u8 *data; + u32 policy; + u16 namelen; + u16 datalen; + u8 os; +}; + +struct plpks_var_name { + u8 *name; + u16 namelen; +}; + +struct plpks_var_name_list { + u32 varcount; + struct plpks_var_name varlist[]; +}; + +/** + * Writes the specified var and its data to PKS. + * Any caller of PKS driver should present a valid component type for + * their variable. + */ +int plpks_write_var(struct plpks_var var); + +/** + * Removes the specified var and its data from PKS. + */ +int plpks_remove_var(char *component, u8 varos, + struct plpks_var_name vname); + +/** + * Returns the data for the specified os variable. + */ +int plpks_read_os_var(struct plpks_var *var); + +/** + * Returns the data for the specified firmware variable. + */ +int plpks_read_fw_var(struct plpks_var *var); + +/** + * Returns the data for the specified bootloader variable. + */ +int plpks_read_bootloader_var(struct plpks_var *var); + +#endif -- cgit v1.2.3 From 7ef3d06f1bc4a5e62273726f3dc2bd258ae1c71f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Jul 2022 00:32:18 +1000 Subject: powerpc/powernv/kvm: Use darn for H_RANDOM on Power9 The existing logic in KVM to support guests calling H_RANDOM only works on Power8, because it looks for an RNG in the device tree, but on Power9 we just use darn. In addition the existing code needs to work in real mode, so we have the special cased powernv_get_random_real_mode() to deal with that. Instead just have KVM call ppc_md.get_random_seed(), and do the real mode check inside of there, that way we use whatever RNG is available, including darn on Power9. Fixes: e928e9cb3601 ("KVM: PPC: Book3S HV: Add fast real-mode H_RANDOM implementation.") Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Jason A. Donenfeld Tested-by: Sachin Sant [mpe: Rebase on previous commit, update change log appropriately] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220727143219.2684192-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/archrandom.h | 5 ----- arch/powerpc/kvm/book3s_hv_builtin.c | 7 ++++--- arch/powerpc/platforms/powernv/rng.c | 36 ++++++++--------------------------- 3 files changed, 12 insertions(+), 36 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 9a53e29680f4..258174304904 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -38,12 +38,7 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) #endif /* CONFIG_ARCH_RANDOM */ #ifdef CONFIG_PPC_POWERNV -int powernv_hwrng_present(void); int powernv_get_random_long(unsigned long *v); -int powernv_get_random_real_mode(unsigned long *v); -#else -static inline int powernv_hwrng_present(void) { return 0; } -static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; } #endif #endif /* _ASM_POWERPC_ARCHRANDOM_H */ diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 88a8f6473c4e..3abaef5f9ac2 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -176,13 +176,14 @@ EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); int kvmppc_hwrng_present(void) { - return powernv_hwrng_present(); + return ppc_md.get_random_seed != NULL; } EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); long kvmppc_rm_h_random(struct kvm_vcpu *vcpu) { - if (powernv_get_random_real_mode(&vcpu->arch.regs.gpr[4])) + if (ppc_md.get_random_seed && + ppc_md.get_random_seed(&vcpu->arch.regs.gpr[4])) return H_SUCCESS; return H_HARDWARE; diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 2287c9cd0cd5..d19305292e1e 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -29,15 +29,6 @@ struct powernv_rng { static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); -int powernv_hwrng_present(void) -{ - struct powernv_rng *rng; - - rng = get_cpu_var(powernv_rng); - put_cpu_var(rng); - return rng != NULL; -} - static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) { unsigned long parity; @@ -58,19 +49,6 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) return val; } -int powernv_get_random_real_mode(unsigned long *v) -{ - struct powernv_rng *rng; - - rng = raw_cpu_read(powernv_rng); - if (!rng) - return 0; - - *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); - - return 1; -} - static int powernv_get_random_darn(unsigned long *v) { unsigned long val; @@ -107,12 +85,14 @@ int powernv_get_random_long(unsigned long *v) { struct powernv_rng *rng; - rng = get_cpu_var(powernv_rng); - - *v = rng_whiten(rng, in_be64(rng->regs)); - - put_cpu_var(rng); - + if (mfmsr() & MSR_DR) { + rng = get_cpu_var(powernv_rng); + *v = rng_whiten(rng, in_be64(rng->regs)); + put_cpu_var(rng); + } else { + rng = raw_cpu_read(powernv_rng); + *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); + } return 1; } EXPORT_SYMBOL_GPL(powernv_get_random_long); -- cgit v1.2.3 From 978030f054ff97d9079b35f0178e2013918fb316 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Jul 2022 00:32:19 +1000 Subject: powerpc/powernv: rename remaining rng powernv_ functions to pnv_ The preferred nomenclature is pnv_, not powernv_, but rng.c used powernv_ for some reason, which isn't consistent with the rest. A recent commit added a few pnv_ functions to rng.c, making the file a bit of a mishmash. This commit just replaces the rest of them. Fixes: f3eac426657d ("powerpc/powernv: wire up rng during setup_arch") Signed-off-by: Jason A. Donenfeld Tested-by: Sachin Sant [mpe: Reorder after bug fix commits] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220727143219.2684192-3-mpe@ellerman.id.au --- arch/powerpc/include/asm/archrandom.h | 2 +- arch/powerpc/platforms/powernv/rng.c | 34 +++++++++++++++++----------------- drivers/char/hw_random/powernv-rng.c | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 258174304904..3af27bb84a3d 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -38,7 +38,7 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) #endif /* CONFIG_ARCH_RANDOM */ #ifdef CONFIG_PPC_POWERNV -int powernv_get_random_long(unsigned long *v); +int pnv_get_random_long(unsigned long *v); #endif #endif /* _ASM_POWERPC_ARCHRANDOM_H */ diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index d19305292e1e..196aa70fe043 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -21,15 +21,15 @@ #define DARN_ERR 0xFFFFFFFFFFFFFFFFul -struct powernv_rng { +struct pnv_rng { void __iomem *regs; void __iomem *regs_real; unsigned long mask; }; -static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); +static DEFINE_PER_CPU(struct pnv_rng *, pnv_rng); -static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) +static unsigned long rng_whiten(struct pnv_rng *rng, unsigned long val) { unsigned long parity; @@ -49,7 +49,7 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) return val; } -static int powernv_get_random_darn(unsigned long *v) +static int pnv_get_random_darn(unsigned long *v) { unsigned long val; @@ -73,31 +73,31 @@ static int __init initialise_darn(void) return -ENODEV; for (i = 0; i < 10; i++) { - if (powernv_get_random_darn(&val)) { - ppc_md.get_random_seed = powernv_get_random_darn; + if (pnv_get_random_darn(&val)) { + ppc_md.get_random_seed = pnv_get_random_darn; return 0; } } return -EIO; } -int powernv_get_random_long(unsigned long *v) +int pnv_get_random_long(unsigned long *v) { - struct powernv_rng *rng; + struct pnv_rng *rng; if (mfmsr() & MSR_DR) { - rng = get_cpu_var(powernv_rng); + rng = get_cpu_var(pnv_rng); *v = rng_whiten(rng, in_be64(rng->regs)); put_cpu_var(rng); } else { - rng = raw_cpu_read(powernv_rng); + rng = raw_cpu_read(pnv_rng); *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); } return 1; } -EXPORT_SYMBOL_GPL(powernv_get_random_long); +EXPORT_SYMBOL_GPL(pnv_get_random_long); -static __init void rng_init_per_cpu(struct powernv_rng *rng, +static __init void rng_init_per_cpu(struct pnv_rng *rng, struct device_node *dn) { int chip_id, cpu; @@ -107,16 +107,16 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng, pr_warn("No ibm,chip-id found for %pOF.\n", dn); for_each_possible_cpu(cpu) { - if (per_cpu(powernv_rng, cpu) == NULL || + if (per_cpu(pnv_rng, cpu) == NULL || cpu_to_chip_id(cpu) == chip_id) { - per_cpu(powernv_rng, cpu) = rng; + per_cpu(pnv_rng, cpu) = rng; } } } static __init int rng_create(struct device_node *dn) { - struct powernv_rng *rng; + struct pnv_rng *rng; struct resource res; unsigned long val; @@ -142,7 +142,7 @@ static __init int rng_create(struct device_node *dn) rng_init_per_cpu(rng, dn); - ppc_md.get_random_seed = powernv_get_random_long; + ppc_md.get_random_seed = pnv_get_random_long; return 0; } @@ -190,7 +190,7 @@ static int __init pnv_rng_late_init(void) if (ppc_md.get_random_seed == pnv_get_random_long_early) pnv_get_random_long_early(&v); - if (ppc_md.get_random_seed == powernv_get_random_long) { + if (ppc_md.get_random_seed == pnv_get_random_long) { for_each_compatible_node(dn, NULL, "ibm,power-rng") of_platform_device_create(dn, NULL, NULL); } diff --git a/drivers/char/hw_random/powernv-rng.c b/drivers/char/hw_random/powernv-rng.c index 8da1d7917bdc..429e956f34e1 100644 --- a/drivers/char/hw_random/powernv-rng.c +++ b/drivers/char/hw_random/powernv-rng.c @@ -23,7 +23,7 @@ static int powernv_rng_read(struct hwrng *rng, void *data, size_t max, bool wait buf = (unsigned long *)data; for (i = 0; i < len; i++) - powernv_get_random_long(buf++); + pnv_get_random_long(buf++); return len * sizeof(unsigned long); } -- cgit v1.2.3 From 3e731203153de1c06a8b7a4f15061e9051c09a6f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 23 Jul 2022 14:45:36 -0700 Subject: powerpc: drop dependency on in archrandom.h archrandom.h includes to refer ppc_md. This causes circular header dependency, if generic nodemask.h includes random.h: In file included from include/linux/cred.h:16, from include/linux/seq_file.h:13, from arch/powerpc/include/asm/machdep.h:6, from arch/powerpc/include/asm/archrandom.h:5, from include/linux/random.h:109, from include/linux/nodemask.h:97, from include/linux/list_lru.h:12, from include/linux/fs.h:13, from include/linux/compat.h:17, from arch/powerpc/kernel/asm-offsets.c:12: include/linux/sched.h:1203:9: error: unknown type name 'nodemask_t' 1203 | nodemask_t mems_allowed; | ^~~~~~~~~~ Fix it by removing dependency from archrandom.h Now as arch_get_random_seed_long() moved to c-file, and not exported, it's not available for modules. As Michael Ellerman says: I think we actually don't need it exported to modules, I think it's a private detail of the RNG <-> architecture interface, not something that modules should be calling. CC: Andy Shevchenko CC: Benjamin Herrenschmidt CC: Michael Ellerman CC: Paul Mackerras CC: Rasmus Villemoes CC: Stephen Rothwell CC: linuxppc-dev@lists.ozlabs.org Suggested-by: Michael Ellerman (for non-exporting) Acked-by: Michael Ellerman Acked-by: Michael Ellerman Signed-off-by: Yury Norov --- arch/powerpc/include/asm/archrandom.h | 9 +-------- arch/powerpc/kernel/setup-common.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 9a53e29680f4..21def59ef1a6 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -4,7 +4,7 @@ #ifdef CONFIG_ARCH_RANDOM -#include +bool __must_check arch_get_random_seed_long(unsigned long *v); static inline bool __must_check arch_get_random_long(unsigned long *v) { @@ -16,13 +16,6 @@ static inline bool __must_check arch_get_random_int(unsigned int *v) return false; } -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - if (ppc_md.get_random_seed) - return ppc_md.get_random_seed(v); - - return false; -} static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index eb0077b302e2..5175726c703d 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -171,6 +171,18 @@ EXPORT_SYMBOL_GPL(machine_power_off); void (*pm_power_off)(void); EXPORT_SYMBOL_GPL(pm_power_off); +#ifdef CONFIG_ARCH_RANDOM +bool __must_check arch_get_random_seed_long(unsigned long *v) +{ + if (ppc_md.get_random_seed) + return ppc_md.get_random_seed(v); + + return false; +} +EXPORT_SYMBOL(arch_get_random_seed_long); + +#endif + void machine_halt(void) { machine_shutdown(); -- cgit v1.2.3 From 18db466a9a306406dab3b134014d9f6ed642471c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Aug 2022 11:02:36 +0200 Subject: powerpc: Fix eh field when calling lwarx on PPC32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9401f4e46cf6 ("powerpc: Use lwarx/ldarx directly instead of PPC_LWARX/LDARX macros") properly handled the eh field of lwarx in asm/bitops.h but failed to clear it for PPC32 in asm/simple_spinlock.h So, do as in arch_atomic_try_cmpxchg_lock(), set it to 1 if PPC64 but set it to 0 if PPC32. For that use IS_ENABLED(CONFIG_PPC64) which returns 1 when CONFIG_PPC64 is set and 0 otherwise. Fixes: 9401f4e46cf6 ("powerpc: Use lwarx/ldarx directly instead of PPC_LWARX/LDARX macros") Cc: stable@vger.kernel.org # v5.15+ Reported-by: Pali Rohár Signed-off-by: Christophe Leroy Tested-by: Pali Rohár Reviewed-by: Segher Boessenkool [mpe: Use symbolic names, use 'n' constraint per Segher] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a1176e19e627dd6a1b8d24c6c457a8ab874b7d12.1659430931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/simple_spinlock.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h index 7ae6aeef8464..9dcc7e9993b9 100644 --- a/arch/powerpc/include/asm/simple_spinlock.h +++ b/arch/powerpc/include/asm/simple_spinlock.h @@ -48,10 +48,11 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) { unsigned long tmp, token; + unsigned int eh = IS_ENABLED(CONFIG_PPC64); token = LOCK_TOKEN; __asm__ __volatile__( -"1: lwarx %0,0,%2,1\n\ +"1: lwarx %0,0,%2,%[eh]\n\ cmpwi 0,%0,0\n\ bne- 2f\n\ stwcx. %1,0,%2\n\ @@ -59,7 +60,7 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) PPC_ACQUIRE_BARRIER "2:" : "=&r" (tmp) - : "r" (token), "r" (&lock->slock) + : "r" (token), "r" (&lock->slock), [eh] "n" (eh) : "cr0", "memory"); return tmp; @@ -156,9 +157,10 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) static inline long __arch_read_trylock(arch_rwlock_t *rw) { long tmp; + unsigned int eh = IS_ENABLED(CONFIG_PPC64); __asm__ __volatile__( -"1: lwarx %0,0,%1,1\n" +"1: lwarx %0,0,%1,%[eh]\n" __DO_SIGN_EXTEND " addic. %0,%0,1\n\ ble- 2f\n" @@ -166,7 +168,7 @@ static inline long __arch_read_trylock(arch_rwlock_t *rw) bne- 1b\n" PPC_ACQUIRE_BARRIER "2:" : "=&r" (tmp) - : "r" (&rw->lock) + : "r" (&rw->lock), [eh] "n" (eh) : "cr0", "xer", "memory"); return tmp; @@ -179,17 +181,18 @@ static inline long __arch_read_trylock(arch_rwlock_t *rw) static inline long __arch_write_trylock(arch_rwlock_t *rw) { long tmp, token; + unsigned int eh = IS_ENABLED(CONFIG_PPC64); token = WRLOCK_TOKEN; __asm__ __volatile__( -"1: lwarx %0,0,%2,1\n\ +"1: lwarx %0,0,%2,%[eh]\n\ cmpwi 0,%0,0\n\ bne- 2f\n" " stwcx. %1,0,%2\n\ bne- 1b\n" PPC_ACQUIRE_BARRIER "2:" : "=&r" (tmp) - : "r" (token), "r" (&rw->lock) + : "r" (token), "r" (&rw->lock), [eh] "n" (eh) : "cr0", "memory"); return tmp; -- cgit v1.2.3 From eb5a33ea31190c189ca4a59de4687b0877662c06 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Aug 2022 11:02:37 +0200 Subject: powerpc: Don't hide eh field of lwarx behind a macro The eh field must remain 0 for PPC32 and is only used by PPC64. Don't hide that behind a macro, just leave the responsibility to the user. At the time being, the only users of PPC_RAW_L{WDQ}ARX are setting the eh field to 0, so the special handling of __PPC_EH is useless. Just take the value given by the caller. Same for DEFINE_TESTOP(), don't do special handling in that macro, ensure the caller hands over the proper eh value. Signed-off-by: Christophe Leroy [mpe: Use 'n' constraint per Segher] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/8b9c8a1a14f9143552a85fcbf96698224a8c2469.1659430931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/bitops.h | 4 ++-- arch/powerpc/include/asm/ppc-opcode.h | 11 +---------- 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 344fba3b16eb..7e0f0322912b 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -163,7 +163,7 @@ static inline unsigned long fn( \ "bne- 1b\n" \ postfix \ : "=&r" (old), "=&r" (t) \ - : "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ + : "rK" (mask), "r" (p), "n" (eh) \ : "cc", "memory"); \ return (old & mask); \ } @@ -171,7 +171,7 @@ static inline unsigned long fn( \ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, 0) DEFINE_TESTOP(test_and_set_bits_lock, or, "", - PPC_ACQUIRE_BARRIER, 1) + PPC_ACQUIRE_BARRIER, IS_ENABLED(CONFIG_PPC64)) DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, 0) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 7b81b37a191e..d9703c5fd713 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -343,6 +343,7 @@ #define __PPC_SPR(r) ((((r) & 0x1f) << 16) | ((((r) >> 5) & 0x1f) << 11)) #define __PPC_RC21 (0x1 << 10) #define __PPC_PRFX_R(r) (((r) & 0x1) << 20) +#define __PPC_EH(eh) (((eh) & 0x1) << 0) /* * Both low and high 16 bits are added as SIGNED additions, so if low 16 bits @@ -359,16 +360,6 @@ #define PPC_LI_MASK 0x03fffffc #define PPC_LI(v) ((v) & PPC_LI_MASK) -/* - * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a - * larx with EH set as an illegal instruction. - */ -#ifdef CONFIG_PPC64 -#define __PPC_EH(eh) (((eh) & 0x1) << 0) -#else -#define __PPC_EH(eh) 0 -#endif - /* Base instruction encoding */ #define PPC_RAW_CP_ABORT (0x7c00068c) #define PPC_RAW_COPY(a, b) (PPC_INST_COPY | ___PPC_RA(a) | ___PPC_RB(b)) -- cgit v1.2.3 From 5cccf7a5215d12027e55e247907817631b413c28 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Aug 2022 11:02:38 +0200 Subject: powerpc: Make eh value more explicit when using lwarx Just like the first patch of this series, define a local 'eh' in order to make the code clearer. And IS_ENABLED() returns either 1 or 0 so no need to do IS_ENABLED(CONFIG_PPC64) ? 1 : 0. Signed-off-by: Christophe Leroy [mpe: Use symbolic names, use 'n' constraint per Segher] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/629befaa2d05e2922346e58a383886510d6af55a.1659430931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/atomic.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 853dc86864f4..486ab7889121 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -140,9 +140,10 @@ static __always_inline bool arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) { int r, o = *old; + unsigned int eh = IS_ENABLED(CONFIG_PPC64); __asm__ __volatile__ ( -"1: lwarx %0,0,%2,%5 # atomic_try_cmpxchg_acquire \n" +"1: lwarx %0,0,%2,%[eh] # atomic_try_cmpxchg_acquire \n" " cmpw 0,%0,%3 \n" " bne- 2f \n" " stwcx. %4,0,%2 \n" @@ -150,7 +151,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) "\t" PPC_ACQUIRE_BARRIER " \n" "2: \n" : "=&r" (r), "+m" (v->counter) - : "r" (&v->counter), "r" (o), "r" (new), "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) + : "r" (&v->counter), "r" (o), "r" (new), [eh] "n" (eh) : "cr0", "memory"); if (unlikely(r != o)) -- cgit v1.2.3 From 59bab33a4f57f886c5f8a4d1f2bed728ec185d16 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 3 Aug 2022 13:47:33 +0200 Subject: powerpc/ppc-opcode: Fix PPC_RAW_TW() PPC_RAW_TW() is erroneously defined with base code 0x7f000008 instead of 0x7c000008. That's invisible because its only user is PPC_RAW_TRAP() which is 0x7fe00008, but fix it anyway to avoid any risk of future bug. Fixes: d00d762daf12 ("powerpc/ppc-opcode: Define and use PPC_RAW_TRAP() and PPC_RAW_TW()") Reported-by: Naveen N. Rao Signed-off-by: Christophe Leroy Reviewed-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/eca9251f1e1f82c4c46ec6380ddb28356ab3fdfe.1659527244.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index d9703c5fd713..c6d724104ed1 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -571,7 +571,7 @@ #define PPC_RAW_BRANCH(offset) (0x48000000 | PPC_LI(offset)) #define PPC_RAW_BL(offset) (0x48000001 | PPC_LI(offset)) -#define PPC_RAW_TW(t0, a, b) (0x7f000008 | ___PPC_RS(t0) | ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_TW(t0, a, b) (0x7c000008 | ___PPC_RS(t0) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_TRAP() PPC_RAW_TW(31, 0, 0) #define PPC_RAW_SETB(t, bfa) (0x7c000100 | ___PPC_RT(t) | ___PPC_RA((bfa) << 2)) -- cgit v1.2.3 From 20ec3ebd707c77fb9b11b37193449193d4649f33 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Tue, 16 Aug 2022 20:53:22 +0800 Subject: KVM: Rename mmu_notifier_* to mmu_invalidate_* The motivation of this renaming is to make these variables and related helper functions less mmu_notifier bound and can also be used for non mmu_notifier based page invalidation. mmu_invalidate_* was chosen to better describe the purpose of 'invalidating' a page that those variables are used for. - mmu_notifier_seq/range_start/range_end are renamed to mmu_invalidate_seq/range_start/range_end. - mmu_notifier_retry{_hva} helper functions are renamed to mmu_invalidate_retry{_hva}. - mmu_notifier_count is renamed to mmu_invalidate_in_progress to avoid confusion with mn_active_invalidate_count. - While here, also update kvm_inc/dec_notifier_count() to kvm_mmu_invalidate_begin/end() to match the change for mmu_notifier_count. No functional change intended. Signed-off-by: Chao Peng Message-Id: <20220816125322.1110439-3-chao.p.peng@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/mmu.c | 8 ++--- arch/mips/kvm/mmu.c | 12 +++---- arch/powerpc/include/asm/kvm_book3s_64.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_host.c | 4 +-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 +-- arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 ++-- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 8 ++--- arch/powerpc/kvm/e500_mmu_host.c | 4 +-- arch/riscv/kvm/mmu.c | 4 +-- arch/x86/kvm/mmu/mmu.c | 14 ++++---- arch/x86/kvm/mmu/paging_tmpl.h | 4 +-- include/linux/kvm_host.h | 60 +++++++++++++++++--------------- virt/kvm/kvm_main.c | 52 ++++++++++++++------------- virt/kvm/pfncache.c | 17 ++++----- 15 files changed, 103 insertions(+), 98 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 87f1cd0df36e..c9a13e487187 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -993,7 +993,7 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, * THP doesn't start to split while we are adjusting the * refcounts. * - * We are sure this doesn't happen, because mmu_notifier_retry + * We are sure this doesn't happen, because mmu_invalidate_retry * was successful and we are holding the mmu_lock, so if this * THP is trying to split, it will be blocked in the mmu * notifier before touching any of the pages, specifically @@ -1188,9 +1188,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return ret; } - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_notifier_seq happens before we call + * Ensure the read of mmu_invalidate_seq happens before we call * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk * the page we just got a reference to gets unmapped before we have a * chance to grab the mmu_lock, which ensure that if the page gets @@ -1246,7 +1246,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, else write_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; /* diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index db17e870bdff..74cd64a24d05 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -615,17 +615,17 @@ retry: * Used to check for invalidations in progress, of the pfn that is * returned by pfn_to_pfn_prot below. */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in - * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads + * in gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't * risk the page we get a reference to getting unmapped before we have a - * chance to grab the mmu_lock without mmu_notifier_retry() noticing. + * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. * * This smp_rmb() pairs with the effective smp_wmb() of the combination * of the pte_unmap_unlock() after the PTE is zapped, and the * spin_lock() in kvm_mmu_notifier_invalidate_() before - * mmu_notifier_seq is incremented. + * mmu_invalidate_seq is incremented. */ smp_rmb(); @@ -638,7 +638,7 @@ retry: spin_lock(&kvm->mmu_lock); /* Check if an invalidation has taken place since we got pfn */ - if (mmu_notifier_retry(kvm, mmu_seq)) { + if (mmu_invalidate_retry(kvm, mmu_seq)) { /* * This can happen when mappings are changed asynchronously, but * also synchronously if a COW is triggered by diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 4def2bd17b9b..d49065af08e9 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -666,7 +666,7 @@ static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq, VM_WARN(!spin_is_locked(&kvm->mmu_lock), "%s called with kvm mmu_lock not held \n", __func__); - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) return NULL; pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift); diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 1ae09992c9ea..bc6a381b5346 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -90,7 +90,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, unsigned long pfn; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* Get host physical address for gpa */ @@ -151,7 +151,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, cpte = kvmppc_mmu_hpte_cache_next(vcpu); spin_lock(&kvm->mmu_lock); - if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { + if (!cpte || mmu_invalidate_retry(kvm, mmu_seq)) { r = -EAGAIN; goto out_unlock; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 514fd45c1994..e9744b41a226 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -578,7 +578,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, return -EFAULT; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); ret = -EFAULT; @@ -693,7 +693,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, /* Check if we might have been invalidated; let the guest retry if so */ ret = RESUME_GUEST; - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) { unlock_rmap(rmap); goto out_unlock; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 9d4b3feda3b6..5d5e12f3bf86 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -640,7 +640,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Check if we might have been invalidated; let the guest retry if so */ spin_lock(&kvm->mmu_lock); ret = -EAGAIN; - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; /* Now traverse again under the lock and change the tree */ @@ -830,7 +830,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, bool large_enable; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* @@ -1191,7 +1191,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm, * Increase the mmu notifier sequence number to prevent any page * fault that read the memslot earlier from writing a PTE. */ - kvm->mmu_notifier_seq++; + kvm->mmu_invalidate_seq++; spin_unlock(&kvm->mmu_lock); } diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index be8249cc6107..5a64a1341e6f 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -1580,7 +1580,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, /* 2. Find the host pte for this L1 guest real address */ /* Used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* See if can find translation in our partition scoped tables for L1 */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 2257fb18cb72..5a05953ae13f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -219,7 +219,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, g_ptel = ptel; /* used later to detect if we might have been invalidated */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* Find the memslot (if any) for this address */ @@ -366,7 +366,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, rmap = real_vmalloc_addr(rmap); lock_rmap(rmap); /* Check for pending invalidations under the rmap chain lock */ - if (mmu_notifier_retry(kvm, mmu_seq)) { + if (mmu_invalidate_retry(kvm, mmu_seq)) { /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; @@ -932,7 +932,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu, int i; /* Used later to detect if we might have been invalidated */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); @@ -960,7 +960,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu, long ret = H_SUCCESS; /* Used later to detect if we might have been invalidated */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 7f16afc331ef..05668e964140 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -339,7 +339,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned long flags; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* @@ -460,7 +460,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(kvm, mmu_seq)) { + if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; goto out; } diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 3a35b2d95697..3620ecac2fa1 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -666,7 +666,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, return ret; } - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable); if (hfn == KVM_PFN_ERR_HWPOISON) { @@ -686,7 +686,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; if (writable) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index eccddb136954..126fa9aec64c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2914,7 +2914,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; __direct_pte_prefetch(vcpu, sp, sptep); @@ -2928,7 +2928,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * * There are several ways to safely use this helper: * - * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * @@ -3056,7 +3056,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return; /* - * mmu_notifier_retry() was successful and mmu_lock is held, so + * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ fault->goal_level = fault->req_level; @@ -4203,7 +4203,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4227,7 +4227,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); r = kvm_faultin_pfn(vcpu, fault); @@ -6055,7 +6055,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6069,7 +6069,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end - gfn_start); - kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f5958071220c..39e0205e7300 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -589,7 +589,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; if (sp->role.direct) @@ -838,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); r = kvm_faultin_pfn(vcpu, fault); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fd5c3b4715f8..f4519d3689e1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -765,10 +765,10 @@ struct kvm { #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) struct mmu_notifier mmu_notifier; - unsigned long mmu_notifier_seq; - long mmu_notifier_count; - unsigned long mmu_notifier_range_start; - unsigned long mmu_notifier_range_end; + unsigned long mmu_invalidate_seq; + long mmu_invalidate_in_progress; + unsigned long mmu_invalidate_range_start; + unsigned long mmu_invalidate_range_end; #endif struct list_head devices; u64 manual_dirty_log_protect; @@ -1357,10 +1357,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end); -void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end); +void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, + unsigned long end); +void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, + unsigned long end); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -1907,42 +1907,44 @@ extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) -static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) +static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { - if (unlikely(kvm->mmu_notifier_count)) + if (unlikely(kvm->mmu_invalidate_in_progress)) return 1; /* - * Ensure the read of mmu_notifier_count happens before the read - * of mmu_notifier_seq. This interacts with the smp_wmb() in - * mmu_notifier_invalidate_range_end to make sure that the caller - * either sees the old (non-zero) value of mmu_notifier_count or - * the new (incremented) value of mmu_notifier_seq. - * PowerPC Book3s HV KVM calls this under a per-page lock - * rather than under kvm->mmu_lock, for scalability, so - * can't rely on kvm->mmu_lock to keep things ordered. + * Ensure the read of mmu_invalidate_in_progress happens before + * the read of mmu_invalidate_seq. This interacts with the + * smp_wmb() in mmu_notifier_invalidate_range_end to make sure + * that the caller either sees the old (non-zero) value of + * mmu_invalidate_in_progress or the new (incremented) value of + * mmu_invalidate_seq. + * + * PowerPC Book3s HV KVM calls this under a per-page lock rather + * than under kvm->mmu_lock, for scalability, so can't rely on + * kvm->mmu_lock to keep things ordered. */ smp_rmb(); - if (kvm->mmu_notifier_seq != mmu_seq) + if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } -static inline int mmu_notifier_retry_hva(struct kvm *kvm, - unsigned long mmu_seq, - unsigned long hva) +static inline int mmu_invalidate_retry_hva(struct kvm *kvm, + unsigned long mmu_seq, + unsigned long hva) { lockdep_assert_held(&kvm->mmu_lock); /* - * If mmu_notifier_count is non-zero, then the range maintained by - * kvm_mmu_notifier_invalidate_range_start contains all addresses that - * might be being invalidated. Note that it may include some false + * If mmu_invalidate_in_progress is non-zero, then the range maintained + * by kvm_mmu_notifier_invalidate_range_start contains all addresses + * that might be being invalidated. Note that it may include some false * positives, due to shortcuts when handing concurrent invalidations. */ - if (unlikely(kvm->mmu_notifier_count) && - hva >= kvm->mmu_notifier_range_start && - hva < kvm->mmu_notifier_range_end) + if (unlikely(kvm->mmu_invalidate_in_progress) && + hva >= kvm->mmu_invalidate_range_start && + hva < kvm->mmu_invalidate_range_end) return 1; - if (kvm->mmu_notifier_seq != mmu_seq) + if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 44b92d773156..5142c054b03c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -702,30 +702,31 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, /* * .change_pte() must be surrounded by .invalidate_range_{start,end}(). - * If mmu_notifier_count is zero, then no in-progress invalidations, - * including this one, found a relevant memslot at start(); rechecking - * memslots here is unnecessary. Note, a false positive (count elevated - * by a different invalidation) is sub-optimal but functionally ok. + * If mmu_invalidate_in_progress is zero, then no in-progress + * invalidations, including this one, found a relevant memslot at + * start(); rechecking memslots here is unnecessary. Note, a false + * positive (count elevated by a different invalidation) is sub-optimal + * but functionally ok. */ WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); - if (!READ_ONCE(kvm->mmu_notifier_count)) + if (!READ_ONCE(kvm->mmu_invalidate_in_progress)) return; kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } -void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, + unsigned long end) { /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ - kvm->mmu_notifier_count++; - if (likely(kvm->mmu_notifier_count == 1)) { - kvm->mmu_notifier_range_start = start; - kvm->mmu_notifier_range_end = end; + kvm->mmu_invalidate_in_progress++; + if (likely(kvm->mmu_invalidate_in_progress == 1)) { + kvm->mmu_invalidate_range_start = start; + kvm->mmu_invalidate_range_end = end; } else { /* * Fully tracking multiple concurrent ranges has diminishing @@ -736,10 +737,10 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, * accumulate and persist until all outstanding invalidates * complete. */ - kvm->mmu_notifier_range_start = - min(kvm->mmu_notifier_range_start, start); - kvm->mmu_notifier_range_end = - max(kvm->mmu_notifier_range_end, end); + kvm->mmu_invalidate_range_start = + min(kvm->mmu_invalidate_range_start, start); + kvm->mmu_invalidate_range_end = + max(kvm->mmu_invalidate_range_end, end); } } @@ -752,7 +753,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .end = range->end, .pte = __pte(0), .handler = kvm_unmap_gfn_range, - .on_lock = kvm_inc_notifier_count, + .on_lock = kvm_mmu_invalidate_begin, .on_unlock = kvm_arch_guest_memory_reclaimed, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), @@ -763,7 +764,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, /* * Prevent memslot modification between range_start() and range_end() * so that conditionally locking provides the same result in both - * functions. Without that guarantee, the mmu_notifier_count + * functions. Without that guarantee, the mmu_invalidate_in_progress * adjustments will be imbalanced. * * Pairs with the decrement in range_end(). @@ -779,7 +780,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * any given time, and the caches themselves can check for hva overlap, * i.e. don't need to rely on memslot overlap checks for performance. * Because this runs without holding mmu_lock, the pfn caches must use - * mn_active_invalidate_count (see above) instead of mmu_notifier_count. + * mn_active_invalidate_count (see above) instead of + * mmu_invalidate_in_progress. */ gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, hva_range.may_block); @@ -789,22 +791,22 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } -void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, + unsigned long end) { /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have * been freed. */ - kvm->mmu_notifier_seq++; + kvm->mmu_invalidate_seq++; smp_wmb(); /* * The above sequence increase must be visible before the * below count decrease, which is ensured by the smp_wmb above - * in conjunction with the smp_rmb in mmu_notifier_retry(). + * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ - kvm->mmu_notifier_count--; + kvm->mmu_invalidate_in_progress--; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -816,7 +818,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .end = range->end, .pte = __pte(0), .handler = (void *)kvm_null_fn, - .on_lock = kvm_dec_notifier_count, + .on_lock = kvm_mmu_invalidate_end, .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), @@ -837,7 +839,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, if (wake) rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); - BUG_ON(kvm->mmu_notifier_count < 0); + BUG_ON(kvm->mmu_invalidate_in_progress < 0); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index ab519f72f2cd..68ff41d39545 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -112,27 +112,28 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s { /* * mn_active_invalidate_count acts for all intents and purposes - * like mmu_notifier_count here; but the latter cannot be used - * here because the invalidation of caches in the mmu_notifier - * event occurs _before_ mmu_notifier_count is elevated. + * like mmu_invalidate_in_progress here; but the latter cannot + * be used here because the invalidation of caches in the + * mmu_notifier event occurs _before_ mmu_invalidate_in_progress + * is elevated. * * Note, it does not matter that mn_active_invalidate_count * is not protected by gpc->lock. It is guaranteed to * be elevated before the mmu_notifier acquires gpc->lock, and - * isn't dropped until after mmu_notifier_seq is updated. + * isn't dropped until after mmu_invalidate_seq is updated. */ if (kvm->mn_active_invalidate_count) return true; /* * Ensure mn_active_invalidate_count is read before - * mmu_notifier_seq. This pairs with the smp_wmb() in + * mmu_invalidate_seq. This pairs with the smp_wmb() in * mmu_notifier_invalidate_range_end() to guarantee either the * old (non-zero) value of mn_active_invalidate_count or the - * new (incremented) value of mmu_notifier_seq is observed. + * new (incremented) value of mmu_invalidate_seq is observed. */ smp_rmb(); - return kvm->mmu_notifier_seq != mmu_seq; + return kvm->mmu_invalidate_seq != mmu_seq; } static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) @@ -155,7 +156,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) gpc->valid = false; do { - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); write_unlock_irq(&gpc->lock); -- cgit v1.2.3 From 8a8f7866663588b162031a5348c24e42161461cd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Aug 2022 19:31:25 +0200 Subject: powerpc/vdso: Don't map VDSO at a fixed address on PPC32 PPC64 removed default mapping address from VDSO in commit 30d0b3682887 ("powerpc: Move 64bit VDSO to improve context switch performance"). Do like PPC64 and let get_unmapped_area() place the VDSO mapping at the address it wants, don't force a default address. This allows randomisation of VDSO address. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cba76f5a5b01fcc49415e632d92c11c1c5998cab.1660843877.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/vdso.h | 3 --- arch/powerpc/kernel/vdso.c | 13 ++----------- 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 8542e9bbeead..7650b6ce14c8 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -2,9 +2,6 @@ #ifndef _ASM_POWERPC_VDSO_H #define _ASM_POWERPC_VDSO_H -/* Default map addresses for 32bit vDSO */ -#define VDSO32_MBASE 0x100000 - #define VDSO_VERSION_STRING LINUX_2.6.15 #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 0da287544054..bf9574ec26ce 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -200,28 +200,19 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int if (is_32bit_task()) { vdso_spec = &vdso32_spec; vdso_size = &vdso32_end - &vdso32_start; - vdso_base = VDSO32_MBASE; } else { vdso_spec = &vdso64_spec; vdso_size = &vdso64_end - &vdso64_start; - /* - * On 64bit we don't have a preferred map address. This - * allows get_unmapped_area to find an area near other mmaps - * and most likely share a SLB entry. - */ - vdso_base = 0; } mappings_size = vdso_size + vvar_size; mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK; /* - * pick a base address for the vDSO in process space. We try to put it - * at vdso_base which is the "natural" base for it, but we might fail - * and end up putting it elsewhere. + * Pick a base address for the vDSO in process space. * Add enough to the size so that the result can be aligned. */ - vdso_base = get_unmapped_area(NULL, vdso_base, mappings_size, 0, 0); + vdso_base = get_unmapped_area(NULL, 0, mappings_size, 0, 0); if (IS_ERR_VALUE(vdso_base)) return vdso_base; -- cgit v1.2.3 From 310d1344e3c58cc2d625aa4e52cfcb7d8a26fcbf Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 23 Aug 2022 21:59:51 +1000 Subject: Revert "powerpc: Remove unused FW_FEATURE_NATIVE references" This reverts commit 79b74a68486765a4fe685ac4069bc71366c538f5. It broke booting on IBM Cell machines when the kernel is also built with CONFIG_PPC_PS3=y. That's because FW_FEATURE_NATIVE_ALWAYS = 0 does have an important effect, which is to clear the PS3 ALWAYS features from FW_FEATURE_ALWAYS. Note that CONFIG_PPC_NATIVE has since been renamed CONFIG_PPC_HASH_MMU_NATIVE. Fixes: 79b74a684867 ("powerpc: Remove unused FW_FEATURE_NATIVE references") Cc: stable@vger.kernel.org # v5.17+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220823115952.1203106-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/firmware.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 398e0b5e485f..ed6db13a1d7c 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -83,6 +83,8 @@ enum { FW_FEATURE_POWERNV_ALWAYS = 0, FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, + FW_FEATURE_NATIVE_POSSIBLE = 0, + FW_FEATURE_NATIVE_ALWAYS = 0, FW_FEATURE_POSSIBLE = #ifdef CONFIG_PPC_PSERIES FW_FEATURE_PSERIES_POSSIBLE | @@ -92,6 +94,9 @@ enum { #endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_POSSIBLE | +#endif +#ifdef CONFIG_PPC_HASH_MMU_NATIVE + FW_FEATURE_NATIVE_ALWAYS | #endif 0, FW_FEATURE_ALWAYS = @@ -103,6 +108,9 @@ enum { #endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_ALWAYS & +#endif +#ifdef CONFIG_PPC_HASH_MMU_NATIVE + FW_FEATURE_NATIVE_ALWAYS & #endif FW_FEATURE_POSSIBLE, -- cgit v1.2.3 From eb316ae798b36b280ef9e6a79d3aa34d146aa0e4 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Aug 2022 15:06:59 +1000 Subject: powerpc: Move patch sites out of asm-prototypes.h The definitions for the patch sites etc. don't belong in asm-prototypes.h, they are not EXPORT'ed asm symbols. Move them into sections.h which is traditionally used for asm symbols. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220818050659.187181-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/asm-prototypes.h | 13 ------------- arch/powerpc/include/asm/sections.h | 13 +++++++++++++ arch/powerpc/kernel/security.c | 1 + 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 81631e64dbeb..e1b3e90eec94 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -55,19 +55,6 @@ struct kvm_vcpu; void _kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr); void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr); -/* Patch sites */ -extern s32 patch__call_flush_branch_caches1; -extern s32 patch__call_flush_branch_caches2; -extern s32 patch__call_flush_branch_caches3; -extern s32 patch__flush_count_cache_return; -extern s32 patch__flush_link_stack_return; -extern s32 patch__call_kvm_flush_link_stack; -extern s32 patch__call_kvm_flush_link_stack_p9; -extern s32 patch__memset_nocache, patch__memcpy_nocache; - -extern long flush_branch_caches; -extern long kvm_flush_link_stack; - #ifdef CONFIG_PPC_TRANSACTIONAL_MEM void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 8be2c491c733..183e6b8af392 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -14,6 +14,19 @@ typedef struct func_desc func_desc_t; extern char __head_end[]; +/* Patch sites */ +extern s32 patch__call_flush_branch_caches1; +extern s32 patch__call_flush_branch_caches2; +extern s32 patch__call_flush_branch_caches3; +extern s32 patch__flush_count_cache_return; +extern s32 patch__flush_link_stack_return; +extern s32 patch__call_kvm_flush_link_stack; +extern s32 patch__call_kvm_flush_link_stack_p9; +extern s32 patch__memset_nocache, patch__memcpy_nocache; + +extern long flush_branch_caches; +extern long kvm_flush_link_stack; + #ifdef __powerpc64__ extern char __start_interrupts[]; diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index d96fd14bd7c9..b562a1d2c750 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From e38cd72c17fa7d7710088365251feb6c52b501c8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 19 Aug 2022 16:23:43 +0200 Subject: powerpc: Remove stale declarations in mmu_decl.h rtas_size and rtas_data are not used anymore since at least commit 7c8c6b9776fb ("powerpc: Merge lmb.c and make MM initialization use it.") Remove them. Since commit 4b74a35fc7e9 ("powerpc/32s: Make Hash var static") the forward declaration of struct hash_pte is unneeded. Remove it. __initial_memory_limit_addr was removed by commit e63075a3c937 ("memblock: Introduce default allocation limit and use it to replace explicit ones") Remove the declaration. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a821e8397dd56b8177ecc04966d3b3a7c4bda6d4.1660919016.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/setup.h | 1 - arch/powerpc/mm/mmu_decl.h | 4 ---- 2 files changed, 5 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index d8c28902cf59..dd461b2c825c 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -7,7 +7,6 @@ #ifndef __ASSEMBLY__ extern void ppc_printk_progress(char *s, unsigned short hex); -extern unsigned int rtas_data; extern unsigned long long memory_limit; extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 229c72e49198..8f5afad1b6af 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -92,15 +92,11 @@ extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); -extern unsigned int rtas_data, rtas_size; - -struct hash_pte; extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ extern unsigned long __max_low_memory; -extern phys_addr_t __initial_memory_limit_addr; extern phys_addr_t total_memory; extern phys_addr_t total_lowmem; extern phys_addr_t memstart_addr; -- cgit v1.2.3 From 395cac7752b905318ae454a8b859d4c190485510 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 17 Aug 2022 15:06:39 +1000 Subject: powerpc/mm: Support execute-only memory on the Radix MMU Add support for execute-only memory (XOM) for the Radix MMU by using an execute-only mapping, as opposed to the RX mapping used by powerpc's other MMUs. The Hash MMU already supports XOM through the execute-only pkey, which is a separate mechanism shared with x86. A PROT_EXEC-only mapping will map to RX, and then the pkey will be applied on top of it. mmap() and mprotect() consumers in userspace should observe the same behaviour on Hash and Radix despite the differences in implementation. Replacing the vma_is_accessible() check in access_error() with a read check should be functionally equivalent for non-Radix MMUs, since it follows write and execute checks. For Radix, the change enables detecting faults on execute-only mappings where vma_is_accessible() would return true. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220817050640.406017-1-ruscur@russell.cc --- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 ++ arch/powerpc/mm/book3s64/pgtable.c | 11 +++++++++-- arch/powerpc/mm/fault.c | 6 +++++- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 392ff48f77df..486902aff040 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -151,6 +151,8 @@ #define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) +/* Radix only, Hash uses PAGE_READONLY_X + execute-only pkey instead */ +#define PAGE_EXECONLY __pgprot(_PAGE_BASE | _PAGE_EXEC) /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 7b9966402b25..f6151a589298 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -553,8 +553,15 @@ EXPORT_SYMBOL_GPL(memremap_compat_align); pgprot_t vm_get_page_prot(unsigned long vm_flags) { - unsigned long prot = pgprot_val(protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + unsigned long prot; + + /* Radix supports execute-only, but protection_map maps X -> RX */ + if (radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) { + prot = pgprot_val(PAGE_EXECONLY); + } else { + prot = pgprot_val(protection_map[vm_flags & + (VM_ACCESS_FLAGS | VM_SHARED)]); + } if (vm_flags & VM_SAO) prot |= _PAGE_SAO; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 014005428687..1566804e4b3d 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -270,7 +270,11 @@ static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma return false; } - if (unlikely(!vma_is_accessible(vma))) + /* + * Check for a read fault. This could be caused by a read on an + * inaccessible page (i.e. PROT_NONE), or a Radix MMU execute-only page. + */ + if (unlikely(!(vma->vm_flags & VM_READ))) return true; /* * We should ideally do the vma pkey access check here. But in the -- cgit v1.2.3 From 814816d71e29934d0a76ee259b54c0b80c3b0e4a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 23 Aug 2022 18:36:35 +0200 Subject: powerpc: Fix hard_irq_disable() with sanitizer As reported by Zhouyi Zhou, WRITE_ONCE() is not atomic as expected when KASAN or KCSAN are compiled in. Fix it by re-implementing it using inline assembly. Fixes: 077fc62b2b66 ("powerpc/irq: remove inline assembly in hard_irq_disable macro") Reported-by: Zhouyi Zhou Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a8298991b3df049a54ee8e558838e34265812014.1661272586.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 26ede09c521d..3c8cb48f88ae 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -282,7 +282,8 @@ static inline bool pmi_irq_pending(void) flags = irq_soft_mask_set_return(IRQS_ALL_DISABLED); \ local_paca->irq_happened |= PACA_IRQ_HARD_DIS; \ if (!arch_irqs_disabled_flags(flags)) { \ - WRITE_ONCE(local_paca->saved_r1, current_stack_pointer);\ + asm volatile("std%X0 %1,%0" : "=m" (local_paca->saved_r1) \ + : "r" (current_stack_pointer)); \ trace_hardirqs_off(); \ } \ } while(0) -- cgit v1.2.3 From 684c68d92e2e1b97fa2f31c35c1b0f7671a8618a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 31 Aug 2022 23:10:52 +1000 Subject: Revert "powerpc/irq: Don't open code irq_soft_mask helpers" This reverts commit ef5b570d3700fbb8628a58da0487486ceeb713cd. Zhouyi reported that commit is causing crashes when running rcutorture with KASAN enabled: BUG: using smp_processor_id() in preemptible [00000000] code: rcu_torture_rea/100 caller is rcu_preempt_deferred_qs_irqrestore+0x74/0xed0 CPU: 4 PID: 100 Comm: rcu_torture_rea Tainted: G W 5.19.0-rc5-next-20220708-dirty #253 Call Trace: dump_stack_lvl+0xbc/0x108 (unreliable) check_preemption_disabled+0x154/0x160 rcu_preempt_deferred_qs_irqrestore+0x74/0xed0 __rcu_read_unlock+0x290/0x3b0 rcu_torture_read_unlock+0x30/0xb0 rcutorture_one_extend+0x198/0x810 rcu_torture_one_read+0x58c/0xc90 rcu_torture_reader+0x12c/0x360 kthread+0x1e8/0x220 ret_from_kernel_thread+0x5c/0x64 KASAN will generate instrumentation instructions around the WRITE_ONCE(local_paca->irq_soft_mask, mask): 0xc000000000295cb0 <+0>: addis r2,r12,774 0xc000000000295cb4 <+4>: addi r2,r2,16464 0xc000000000295cb8 <+8>: mflr r0 0xc000000000295cbc <+12>: bl 0xc00000000008bb4c 0xc000000000295cc0 <+16>: mflr r0 0xc000000000295cc4 <+20>: std r31,-8(r1) 0xc000000000295cc8 <+24>: addi r3,r13,2354 0xc000000000295ccc <+28>: mr r31,r13 0xc000000000295cd0 <+32>: std r0,16(r1) 0xc000000000295cd4 <+36>: stdu r1,-48(r1) 0xc000000000295cd8 <+40>: bl 0xc000000000609b98 <__asan_store1+8> 0xc000000000295cdc <+44>: nop 0xc000000000295ce0 <+48>: li r9,1 0xc000000000295ce4 <+52>: stb r9,2354(r31) 0xc000000000295ce8 <+56>: addi r1,r1,48 0xc000000000295cec <+60>: ld r0,16(r1) 0xc000000000295cf0 <+64>: ld r31,-8(r1) 0xc000000000295cf4 <+68>: mtlr r0 If there is a context switch before "stb r9,2354(r31)", r31 may not equal to r13, in such case, irq soft mask will not work. The usual solution of marking the code ineligible for instrumentation forces the code out-of-line, which we would prefer to avoid. Christophe proposed a partial revert, but Nick raised some concerns with that. So for now do a full revert. Reported-by: Zhouyi Zhou [mpe: Construct change log based on Zhouyi's original report] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220831131052.42250-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/hw_irq.h | 43 ++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 3c8cb48f88ae..983551859891 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -113,7 +113,14 @@ static inline void __hard_RI_enable(void) static inline notrace unsigned long irq_soft_mask_return(void) { - return READ_ONCE(local_paca->irq_soft_mask); + unsigned long flags; + + asm volatile( + "lbz %0,%1(13)" + : "=r" (flags) + : "i" (offsetof(struct paca_struct, irq_soft_mask))); + + return flags; } /* @@ -140,24 +147,46 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) WARN_ON(mask && !(mask & IRQS_DISABLED)); - WRITE_ONCE(local_paca->irq_soft_mask, mask); - barrier(); + asm volatile( + "stb %0,%1(13)" + : + : "r" (mask), + "i" (offsetof(struct paca_struct, irq_soft_mask)) + : "memory"); } static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) { - unsigned long flags = irq_soft_mask_return(); + unsigned long flags; - irq_soft_mask_set(mask); +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON(mask && !(mask & IRQS_DISABLED)); +#endif + + asm volatile( + "lbz %0,%1(13); stb %2,%1(13)" + : "=&r" (flags) + : "i" (offsetof(struct paca_struct, irq_soft_mask)), + "r" (mask) + : "memory"); return flags; } static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) { - unsigned long flags = irq_soft_mask_return(); + unsigned long flags, tmp; + + asm volatile( + "lbz %0,%2(13); or %1,%0,%3; stb %1,%2(13)" + : "=&r" (flags), "=r" (tmp) + : "i" (offsetof(struct paca_struct, irq_soft_mask)), + "r" (mask) + : "memory"); - irq_soft_mask_set(flags | mask); +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON((mask | flags) && !((mask | flags) & IRQS_DISABLED)); +#endif return flags; } -- cgit v1.2.3 From a8933c8d55c37f4d5eb617b4bdb72b8888b88681 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Sep 2022 18:53:13 +1000 Subject: powerpc/pseries: Add wait interval counter definitions to struct lppaca The hypervisor exposes accumulated partition scheduling interval times in the VPA (lppaca). These can be used to implement a simple stolen time in the guest without complex and costly dtl scanning. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20220902085316.2071519-2-npiggin@gmail.com --- arch/powerpc/include/asm/lppaca.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index c390ec377bae..34d44cb17c87 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -104,14 +104,18 @@ struct lppaca { volatile __be32 dispersion_count; /* dispatch changed physical cpu */ volatile __be64 cmo_faults; /* CMO page fault count */ volatile __be64 cmo_fault_time; /* CMO page fault time */ - u8 reserved10[104]; + u8 reserved10[64]; /* [S]PURR expropriated/donated */ + volatile __be64 enqueue_dispatch_tb; /* Total TB enqueue->dispatch */ + volatile __be64 ready_enqueue_tb; /* Total TB ready->enqueue */ + volatile __be64 wait_ready_tb; /* Total TB wait->ready */ + u8 reserved11[16]; /* cacheline 4-5 */ __be32 page_ins; /* CMO Hint - # page ins by OS */ - u8 reserved11[148]; + u8 reserved12[148]; volatile __be64 dtl_idx; /* Dispatch Trace Log head index */ - u8 reserved12[96]; + u8 reserved13[96]; } ____cacheline_aligned; #define lppaca_of(cpu) (*paca_ptrs[cpu]->lppaca_ptr) -- cgit v1.2.3 From 0e8a63132800dd8ae5fcb19113f79bea43ea18d9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Sep 2022 18:53:14 +1000 Subject: powerpc/pseries: Implement CONFIG_PARAVIRT_TIME_ACCOUNTING CONFIG_VIRT_CPU_ACCOUNTING_GEN under pseries does not provide stolen time accounting unless CONFIG_PARAVIRT_TIME_ACCOUNTING is enabled. Implement this using the VPA accumulated wait counters. Note this will not work on current KVM hosts because KVM does not implement the VPA dispatch counters (yet). It could be implemented with the dispatch trace log as it is for VIRT_CPU_ACCOUNTING_NATIVE, but that is not necessary for the more limited accounting provided by PARAVIRT_TIME_ACCOUNTING, and it is more expensive, complex, and has downsides like potential log wrap. From Shrikanth: [...] it was tested on Power10 [PowerVM] Shared LPAR. system has two LPAR. we will call first one LPAR1 and second one as LPAR2. Test was carried out in SMT=1. Similar observation was seen in SMT=8 as well. LPAR config header from each LPAR is below. LPAR1 is twice as big as LPAR2. Since Both are sharing the same underlying hardware, work stealing will happen when both the LPAR's are contending for the same resource. LPAR1: type=Shared mode=Uncapped smt=Off lcpu=40 cpus=40 ent=20.00 LPAR2: type=Shared mode=Uncapped smt=Off lcpu=20 cpus=40 ent=10.00 mpstat was used to check for the utilization. stress-ng has been used as the workload. Few cases are tested. when the both LPAR are idle there is no steal time. when LPAR1 starts running at 100% which consumes all of the physical resource, steal time starts to get accounted. With LPAR1 running at 100% and LPAR2 starts running, steal time starts increasing. This is as expected. When the LPAR2 Load is increased further, steal time increases further. Case 1: 0% LPAR1; 0% LPAR2 %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 0.00 0.00 0.05 0.00 0.00 0.00 0.00 0.00 0.00 99.95 Case 2: 100% LPAR1; 0% LPAR2 %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 97.68 0.00 0.00 0.00 0.00 0.00 2.32 0.00 0.00 0.00 Case 3: 100% LPAR1; 50% LPAR2 %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 86.34 0.00 0.10 0.00 0.00 0.03 13.54 0.00 0.00 0.00 Case 4: 100% LPAR1; 100% LPAR2 %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 78.54 0.00 0.07 0.00 0.00 0.02 21.36 0.00 0.00 0.00 Case 5: 50% LPAR1; 100% LPAR2 %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 49.37 0.00 0.00 0.00 0.00 0.00 1.17 0.00 0.00 49.47 Patch is accounting for the steal time and basic tests are holding good. Signed-off-by: Nicholas Piggin Tested-by: Shrikanth Hegde [mpe: Add SPDX tag to new paravirt_api_clock.h] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220902085316.2071519-3-npiggin@gmail.com --- Documentation/admin-guide/kernel-parameters.txt | 6 +++--- arch/powerpc/include/asm/paravirt.h | 12 ++++++++++++ arch/powerpc/include/asm/paravirt_api_clock.h | 2 ++ arch/powerpc/platforms/pseries/Kconfig | 8 ++++++++ arch/powerpc/platforms/pseries/lpar.c | 11 +++++++++++ arch/powerpc/platforms/pseries/setup.c | 19 +++++++++++++++++++ 6 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/include/asm/paravirt_api_clock.h (limited to 'arch/powerpc/include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d7f30902fda0..d62fdd0ac497 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3741,9 +3741,9 @@ [X86,PV_OPS] Disable paravirtualized VMware scheduler clock and use the default one. - no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time - accounting. steal time is computed, but won't - influence scheduler behaviour + no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized + steal time accounting. steal time is computed, but + won't influence scheduler behaviour nolapic [X86-32,APIC] Do not enable or use the local APIC. diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h index eb7df559ae74..f5ba1a3c41f8 100644 --- a/arch/powerpc/include/asm/paravirt.h +++ b/arch/powerpc/include/asm/paravirt.h @@ -21,6 +21,18 @@ static inline bool is_shared_processor(void) return static_branch_unlikely(&shared_processor); } +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; + +u64 pseries_paravirt_steal_clock(int cpu); + +static inline u64 paravirt_steal_clock(int cpu) +{ + return pseries_paravirt_steal_clock(cpu); +} +#endif + /* If bit 0 is set, the cpu has been ceded, conferred, or preempted */ static inline u32 yield_count_of(int cpu) { diff --git a/arch/powerpc/include/asm/paravirt_api_clock.h b/arch/powerpc/include/asm/paravirt_api_clock.h new file mode 100644 index 000000000000..d25ca7ac57c7 --- /dev/null +++ b/arch/powerpc/include/asm/paravirt_api_clock.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index fb6499977f99..a3b4d99567cb 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -23,13 +23,21 @@ config PPC_PSERIES select SWIOTLB default y +config PARAVIRT + bool + config PARAVIRT_SPINLOCKS bool +config PARAVIRT_TIME_ACCOUNTING + select PARAVIRT + bool + config PPC_SPLPAR bool "Support for shared-processor logical partitions" depends on PPC_PSERIES select PARAVIRT_SPINLOCKS if PPC_QUEUED_SPINLOCKS + select PARAVIRT_TIME_ACCOUNTING if VIRT_CPU_ACCOUNTING_GEN default y help Enabling this option will make the kernel run more efficiently diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index e6c117fb6491..97ef6499e501 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -660,6 +660,17 @@ static int __init vcpudispatch_stats_procfs_init(void) } machine_device_initcall(pseries, vcpudispatch_stats_procfs_init); + +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +u64 pseries_paravirt_steal_clock(int cpu) +{ + struct lppaca *lppaca = &lppaca_of(cpu); + + return be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) + + be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb)); +} +#endif + #endif /* CONFIG_PPC_SPLPAR */ void vpa_init(int cpu) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 489f4c4df468..5e44c65a032c 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -80,6 +80,20 @@ DEFINE_STATIC_KEY_FALSE(shared_processor); EXPORT_SYMBOL(shared_processor); +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static bool steal_acc = true; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); +#endif + int CMO_PrPSP = -1; int CMO_SecPSP = -1; unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K); @@ -834,6 +848,11 @@ static void __init pSeries_setup_arch(void) if (lppaca_shared_proc(get_lppaca())) { static_branch_enable(&shared_processor); pv_spinlocks_init(); +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); +#endif } ppc_md.power_save = pseries_lpar_idle; -- cgit v1.2.3 From 6ba5aa541aaa079c0ca888f7fe564b2035d5ca0a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Sep 2022 18:53:16 +1000 Subject: powerpc/pseries: Move dtl scanning and steal time accounting to pseries platform dtl is the PAPR Dispatch Trace Log, which is entirely a pseries feature. The pseries platform alrady has a file dealing with the dtl, so move scanning for stolen time accounting there from kernel/time.c. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220902085316.2071519-5-npiggin@gmail.com --- arch/powerpc/include/asm/cputime.h | 2 +- arch/powerpc/include/asm/dtl.h | 8 ---- arch/powerpc/include/asm/time.h | 5 +- arch/powerpc/kernel/time.c | 92 ++---------------------------------- arch/powerpc/platforms/pseries/dtl.c | 81 +++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 98 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 6d2b27997492..431ae2343022 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -95,7 +95,7 @@ static notrace inline void account_stolen_time(void) struct lppaca *lp = local_paca->lppaca_ptr; if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx))) - accumulate_stolen_time(); + pseries_accumulate_stolen_time(); } #endif } diff --git a/arch/powerpc/include/asm/dtl.h b/arch/powerpc/include/asm/dtl.h index 1625888f27ef..4bcb9f9ac764 100644 --- a/arch/powerpc/include/asm/dtl.h +++ b/arch/powerpc/include/asm/dtl.h @@ -37,14 +37,6 @@ struct dtl_entry { extern struct kmem_cache *dtl_cache; extern rwlock_t dtl_access_lock; -/* - * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls - * reading from the dispatch trace log. If other code wants to consume - * DTL entries, it can set this pointer to a function that will get - * called once for each DTL entry that gets processed. - */ -extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); - extern void register_dtl_buffer(int cpu); extern void alloc_dtl_buffers(unsigned long *time_limit); extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity); diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1e5643a9b1f2..9f50766c4623 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -116,8 +116,9 @@ unsigned long long tb_to_ns(unsigned long long tb_ticks); void timer_broadcast_interrupt(void); -/* SPLPAR */ -void accumulate_stolen_time(void); +/* SPLPAR and VIRT_CPU_ACCOUNTING_NATIVE */ +void pseries_accumulate_stolen_time(void); +u64 pseries_calculate_stolen_time(u64 stop_tb); #endif /* __KERNEL__ */ #endif /* __POWERPC_TIME_H */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 587adcc12860..ae3e33b4ef95 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -178,92 +178,6 @@ static inline unsigned long read_spurr(unsigned long tb) return tb; } -#ifdef CONFIG_PPC_SPLPAR - -#include - -void (*dtl_consumer)(struct dtl_entry *, u64); - -/* - * Scan the dispatch trace log and count up the stolen time. - * Should be called with interrupts disabled. - */ -static u64 scan_dispatch_log(u64 stop_tb) -{ - u64 i = local_paca->dtl_ridx; - struct dtl_entry *dtl = local_paca->dtl_curr; - struct dtl_entry *dtl_end = local_paca->dispatch_log_end; - struct lppaca *vpa = local_paca->lppaca_ptr; - u64 tb_delta; - u64 stolen = 0; - u64 dtb; - - if (!dtl) - return 0; - - if (i == be64_to_cpu(vpa->dtl_idx)) - return 0; - while (i < be64_to_cpu(vpa->dtl_idx)) { - dtb = be64_to_cpu(dtl->timebase); - tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + - be32_to_cpu(dtl->ready_to_enqueue_time); - barrier(); - if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { - /* buffer has overflowed */ - i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; - dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); - continue; - } - if (dtb > stop_tb) - break; - if (dtl_consumer) - dtl_consumer(dtl, i); - stolen += tb_delta; - ++i; - ++dtl; - if (dtl == dtl_end) - dtl = local_paca->dispatch_log; - } - local_paca->dtl_ridx = i; - local_paca->dtl_curr = dtl; - return stolen; -} - -/* - * Accumulate stolen time by scanning the dispatch trace log. - * Called on entry from user mode. - */ -void notrace accumulate_stolen_time(void) -{ - u64 sst, ust; - struct cpu_accounting_data *acct = &local_paca->accounting; - - sst = scan_dispatch_log(acct->starttime_user); - ust = scan_dispatch_log(acct->starttime); - acct->stime -= sst; - acct->utime -= ust; - acct->steal_time += ust + sst; -} - -static inline u64 calculate_stolen_time(u64 stop_tb) -{ - if (!firmware_has_feature(FW_FEATURE_SPLPAR)) - return 0; - - if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) - return scan_dispatch_log(stop_tb); - - return 0; -} - -#else /* CONFIG_PPC_SPLPAR */ -static inline u64 calculate_stolen_time(u64 stop_tb) -{ - return 0; -} - -#endif /* CONFIG_PPC_SPLPAR */ - /* * Account time for a transition between system, hard irq * or soft irq state. @@ -322,7 +236,11 @@ static unsigned long vtime_delta(struct cpu_accounting_data *acct, *stime_scaled = vtime_delta_scaled(acct, now, stime); - *steal_time = calculate_stolen_time(now); + if (IS_ENABLED(CONFIG_PPC_SPLPAR) && + firmware_has_feature(FW_FEATURE_SPLPAR)) + *steal_time = pseries_calculate_stolen_time(now); + else + *steal_time = 0; return stime; } diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 352af5b14a0f..1b1977bc78e7 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -37,6 +37,15 @@ static u8 dtl_event_mask = DTL_LOG_ALL; static int dtl_buf_entries = N_DISPATCH_LOG; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + +/* + * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls + * reading from the dispatch trace log. If other code wants to consume + * DTL entries, it can set this pointer to a function that will get + * called once for each DTL entry that gets processed. + */ +static void (*dtl_consumer)(struct dtl_entry *entry, u64 index); + struct dtl_ring { u64 write_index; struct dtl_entry *write_ptr; @@ -48,6 +57,78 @@ static DEFINE_PER_CPU(struct dtl_ring, dtl_rings); static atomic_t dtl_count; +/* + * Scan the dispatch trace log and count up the stolen time. + * Should be called with interrupts disabled. + */ +static notrace u64 scan_dispatch_log(u64 stop_tb) +{ + u64 i = local_paca->dtl_ridx; + struct dtl_entry *dtl = local_paca->dtl_curr; + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + u64 tb_delta; + u64 stolen = 0; + u64 dtb; + + if (!dtl) + return 0; + + if (i == be64_to_cpu(vpa->dtl_idx)) + return 0; + while (i < be64_to_cpu(vpa->dtl_idx)) { + dtb = be64_to_cpu(dtl->timebase); + tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + + be32_to_cpu(dtl->ready_to_enqueue_time); + barrier(); + if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { + /* buffer has overflowed */ + i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + if (dtb > stop_tb) + break; + if (dtl_consumer) + dtl_consumer(dtl, i); + stolen += tb_delta; + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + local_paca->dtl_ridx = i; + local_paca->dtl_curr = dtl; + return stolen; +} + +/* + * Accumulate stolen time by scanning the dispatch trace log. + * Called on entry from user mode. + */ +void notrace pseries_accumulate_stolen_time(void) +{ + u64 sst, ust; + struct cpu_accounting_data *acct = &local_paca->accounting; + + sst = scan_dispatch_log(acct->starttime_user); + ust = scan_dispatch_log(acct->starttime); + acct->stime -= sst; + acct->utime -= ust; + acct->steal_time += ust + sst; +} + +u64 pseries_calculate_stolen_time(u64 stop_tb) +{ + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) + return scan_dispatch_log(stop_tb); + + return 0; +} + /* * The cpu accounting code controls the DTL ring buffer, and we get * given entries as they are processed. -- cgit v1.2.3 From 78c73c80fd860d5b3471d8eaa2778a105a56f6ab Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 08:12:54 +0200 Subject: powerpc/math-emu: Inhibit W=1 warnings When building with W=1 you get: arch/powerpc/math-emu/fre.c:6:5: error: no previous prototype for 'fre' [-Werror=missing-prototypes] arch/powerpc/math-emu/fsqrt.c:11:1: error: no previous prototype for 'fsqrt' [-Werror=missing-prototypes] arch/powerpc/math-emu/fsqrts.c:12:1: error: no previous prototype for 'fsqrts' [-Werror=missing-prototypes] arch/powerpc/math-emu/frsqrtes.c:6:5: error: no previous prototype for 'frsqrtes' [-Werror=missing-prototypes] arch/powerpc/math-emu/mtfsf.c:10:1: error: no previous prototype for 'mtfsf' [-Werror=missing-prototypes] arch/powerpc/math-emu/mtfsfi.c:10:1: error: no previous prototype for 'mtfsfi' [-Werror=missing-prototypes] arch/powerpc/math-emu/fabs.c:7:1: error: no previous prototype for 'fabs' [-Werror=missing-prototypes] arch/powerpc/math-emu/fadd.c:11:1: error: no previous prototype for 'fadd' [-Werror=missing-prototypes] arch/powerpc/math-emu/fadds.c:12:1: error: no previous prototype for 'fadds' [-Werror=missing-prototypes] arch/powerpc/math-emu/fcmpo.c:11:1: error: no previous prototype for 'fcmpo' [-Werror=missing-prototypes] arch/powerpc/math-emu/fcmpu.c:11:1: error: no previous prototype for 'fcmpu' [-Werror=missing-prototypes] arch/powerpc/math-emu/fcmpu.c:14:19: error: variable 'B_c' set but not used [-Werror=unused-but-set-variable] arch/powerpc/math-emu/fcmpu.c:13:19: error: variable 'A_c' set but not used [-Werror=unused-but-set-variable] arch/powerpc/math-emu/fctiw.c:11:1: error: no previous prototype for 'fctiw' [-Werror=missing-prototypes] arch/powerpc/math-emu/fctiwz.c:11:1: error: no previous prototype for 'fctiwz' [-Werror=missing-prototypes] arch/powerpc/math-emu/fdiv.c:11:1: error: no previous prototype for 'fdiv' [-Werror=missing-prototypes] arch/powerpc/math-emu/fdivs.c:12:1: error: no previous prototype for 'fdivs' [-Werror=missing-prototypes] arch/powerpc/math-emu/fmadd.c:11:1: error: no previous prototype for 'fmadd' [-Werror=missing-prototypes] arch/powerpc/math-emu/fmadds.c:12:1: error: no previous prototype for 'fmadds' [-Werror=missing-prototypes] arch/powerpc/math-emu/fmsub.c:11:1: error: no previous prototype for 'fmsub' [-Werror=missing-prototypes] arch/powerpc/math-emu/fmsubs.c:12:1: error: no previous prototype for 'fmsubs' [-Werror=missing-prototypes] arch/powerpc/math-emu/fmul.c:11:1: error: no previous prototype for 'fmul' [-Werror=missing-prototypes] arch/powerpc/math-emu/fmuls.c:12:1: error: no previous prototype for 'fmuls' [-Werror=missing-prototypes] arch/powerpc/math-emu/fnabs.c:7:1: error: no previous prototype for 'fnabs' [-Werror=missing-prototypes] arch/powerpc/math-emu/fneg.c:7:1: error: no previous prototype for 'fneg' [-Werror=missing-prototypes] arch/powerpc/math-emu/fnmadd.c:11:1: error: no previous prototype for 'fnmadd' [-Werror=missing-prototypes] arch/powerpc/math-emu/fnmadds.c:12:1: error: no previous prototype for 'fnmadds' [-Werror=missing-prototypes] arch/powerpc/math-emu/fnmsub.c:11:1: error: no previous prototype for 'fnmsub' [-Werror=missing-prototypes] arch/powerpc/math-emu/fnmsubs.c:12:1: error: no previous prototype for 'fnmsubs' [-Werror=missing-prototypes] arch/powerpc/math-emu/fres.c:7:1: error: no previous prototype for 'fres' [-Werror=missing-prototypes] arch/powerpc/math-emu/frsp.c:12:1: error: no previous prototype for 'frsp' [-Werror=missing-prototypes] arch/powerpc/math-emu/fsel.c:11:1: error: no previous prototype for 'fsel' [-Werror=missing-prototypes] arch/powerpc/math-emu/lfs.c:12:1: error: no previous prototype for 'lfs' [-Werror=missing-prototypes] arch/powerpc/math-emu/frsqrte.c:7:1: error: no previous prototype for 'frsqrte' [-Werror=missing-prototypes] arch/powerpc/math-emu/fsub.c:11:1: error: no previous prototype for 'fsub' [-Werror=missing-prototypes] arch/powerpc/math-emu/fsubs.c:12:1: error: no previous prototype for 'fsubs' [-Werror=missing-prototypes] arch/powerpc/math-emu/mcrfs.c:10:1: error: no previous prototype for 'mcrfs' [-Werror=missing-prototypes] arch/powerpc/math-emu/mffs.c:10:1: error: no previous prototype for 'mffs' [-Werror=missing-prototypes] arch/powerpc/math-emu/mtfsb0.c:10:1: error: no previous prototype for 'mtfsb0' [-Werror=missing-prototypes] arch/powerpc/math-emu/mtfsb1.c:10:1: error: no previous prototype for 'mtfsb1' [-Werror=missing-prototypes] arch/powerpc/math-emu/stfiwx.c:7:1: error: no previous prototype for 'stfiwx' [-Werror=missing-prototypes] arch/powerpc/math-emu/stfs.c:12:1: error: no previous prototype for 'stfs' [-Werror=missing-prototypes] arch/powerpc/math-emu/fmr.c:7:1: error: no previous prototype for 'fmr' [-Werror=missing-prototypes] arch/powerpc/math-emu/lfd.c:10:1: error: no previous prototype for 'lfd' [-Werror=missing-prototypes] arch/powerpc/math-emu/stfd.c:7:1: error: no previous prototype for 'stfd' [-Werror=missing-prototypes] arch/powerpc/math-emu/math_efp.c:177:5: error: no previous prototype for 'do_spe_mathemu' [-Werror=missing-prototypes] arch/powerpc/math-emu/math_efp.c:726:5: error: no previous prototype for 'speround_handler' [-Werror=missing-prototypes] arch/powerpc/math-emu/math_efp.c:893:12: error: no previous prototype for 'spe_mathemu_init' [-Werror=missing-prototypes] Fix the warnings in math_efp.c by adding prototypes of do_spe_mathemu() and speround_handler() to asm/processor.h and declare spe_mathemu_init() static. The other warnings are benign and not worth the churn of fixing them, expecially the 'unused-but-set-variable' which would impact the core part of 'math-emu'. So silence them by adding -Wno-missing-prototypes -Wno-unused-but-set-variable. But then you get: arch/powerpc/math-emu/fre.c:6:5: error: no previous declaration for 'fre' [-Werror=missing-declarations] arch/powerpc/math-emu/fsqrt.c:11:1: error: no previous declaration for 'fsqrt' [-Werror=missing-declarations] arch/powerpc/math-emu/fsqrts.c:12:1: error: no previous declaration for 'fsqrts' [-Werror=missing-declarations] arch/powerpc/math-emu/frsqrtes.c:6:5: error: no previous declaration for 'frsqrtes' [-Werror=missing-declarations] arch/powerpc/math-emu/mtfsf.c:10:1: error: no previous declaration for 'mtfsf' [-Werror=missing-declarations] arch/powerpc/math-emu/mtfsfi.c:10:1: error: no previous declaration for 'mtfsfi' [-Werror=missing-declarations] arch/powerpc/math-emu/fabs.c:7:1: error: no previous declaration for 'fabs' [-Werror=missing-declarations] arch/powerpc/math-emu/fadd.c:11:1: error: no previous declaration for 'fadd' [-Werror=missing-declarations] arch/powerpc/math-emu/fadds.c:12:1: error: no previous declaration for 'fadds' [-Werror=missing-declarations] arch/powerpc/math-emu/fcmpo.c:11:1: error: no previous declaration for 'fcmpo' [-Werror=missing-declarations] arch/powerpc/math-emu/fcmpu.c:11:1: error: no previous declaration for 'fcmpu' [-Werror=missing-declarations] arch/powerpc/math-emu/fctiw.c:11:1: error: no previous declaration for 'fctiw' [-Werror=missing-declarations] arch/powerpc/math-emu/fctiwz.c:11:1: error: no previous declaration for 'fctiwz' [-Werror=missing-declarations] arch/powerpc/math-emu/fdiv.c:11:1: error: no previous declaration for 'fdiv' [-Werror=missing-declarations] arch/powerpc/math-emu/fdivs.c:12:1: error: no previous declaration for 'fdivs' [-Werror=missing-declarations] arch/powerpc/math-emu/fmadd.c:11:1: error: no previous declaration for 'fmadd' [-Werror=missing-declarations] arch/powerpc/math-emu/fmadds.c:12:1: error: no previous declaration for 'fmadds' [-Werror=missing-declarations] arch/powerpc/math-emu/fmsub.c:11:1: error: no previous declaration for 'fmsub' [-Werror=missing-declarations] arch/powerpc/math-emu/fmsubs.c:12:1: error: no previous declaration for 'fmsubs' [-Werror=missing-declarations] arch/powerpc/math-emu/fmul.c:11:1: error: no previous declaration for 'fmul' [-Werror=missing-declarations] arch/powerpc/math-emu/fmuls.c:12:1: error: no previous declaration for 'fmuls' [-Werror=missing-declarations] arch/powerpc/math-emu/fnabs.c:7:1: error: no previous declaration for 'fnabs' [-Werror=missing-declarations] arch/powerpc/math-emu/fneg.c:7:1: error: no previous declaration for 'fneg' [-Werror=missing-declarations] arch/powerpc/math-emu/fnmadd.c:11:1: error: no previous declaration for 'fnmadd' [-Werror=missing-declarations] arch/powerpc/math-emu/fnmadds.c:12:1: error: no previous declaration for 'fnmadds' [-Werror=missing-declarations] arch/powerpc/math-emu/fnmsub.c:11:1: error: no previous declaration for 'fnmsub' [-Werror=missing-declarations] arch/powerpc/math-emu/fnmsubs.c:12:1: error: no previous declaration for 'fnmsubs' [-Werror=missing-declarations] arch/powerpc/math-emu/fres.c:7:1: error: no previous declaration for 'fres' [-Werror=missing-declarations] arch/powerpc/math-emu/frsp.c:12:1: error: no previous declaration for 'frsp' [-Werror=missing-declarations] arch/powerpc/math-emu/fsel.c:11:1: error: no previous declaration for 'fsel' [-Werror=missing-declarations] arch/powerpc/math-emu/lfs.c:12:1: error: no previous declaration for 'lfs' [-Werror=missing-declarations] arch/powerpc/math-emu/frsqrte.c:7:1: error: no previous declaration for 'frsqrte' [-Werror=missing-declarations] arch/powerpc/math-emu/fsub.c:11:1: error: no previous declaration for 'fsub' [-Werror=missing-declarations] arch/powerpc/math-emu/fsubs.c:12:1: error: no previous declaration for 'fsubs' [-Werror=missing-declarations] arch/powerpc/math-emu/mcrfs.c:10:1: error: no previous declaration for 'mcrfs' [-Werror=missing-declarations] arch/powerpc/math-emu/mffs.c:10:1: error: no previous declaration for 'mffs' [-Werror=missing-declarations] arch/powerpc/math-emu/mtfsb0.c:10:1: error: no previous declaration for 'mtfsb0' [-Werror=missing-declarations] arch/powerpc/math-emu/mtfsb1.c:10:1: error: no previous declaration for 'mtfsb1' [-Werror=missing-declarations] arch/powerpc/math-emu/stfiwx.c:7:1: error: no previous declaration for 'stfiwx' [-Werror=missing-declarations] arch/powerpc/math-emu/stfs.c:12:1: error: no previous declaration for 'stfs' [-Werror=missing-declarations] arch/powerpc/math-emu/fmr.c:7:1: error: no previous declaration for 'fmr' [-Werror=missing-declarations] arch/powerpc/math-emu/lfd.c:10:1: error: no previous declaration for 'lfd' [-Werror=missing-declarations] arch/powerpc/math-emu/stfd.c:7:1: error: no previous declaration for 'stfd' [-Werror=missing-declarations] So also add -Wno-missing-declarations. Reported-by: kernel test robot Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/688084b40b5ac88f2905cb207d5dad947d8d34dc.1662531153.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/processor.h | 2 ++ arch/powerpc/kernel/traps.c | 2 -- arch/powerpc/math-emu/Makefile | 7 +++++++ arch/powerpc/math-emu/math_efp.c | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fdfaae194ddd..97a77b37daa3 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -426,6 +426,8 @@ extern int fix_alignment(struct pt_regs *); #endif int do_mathemu(struct pt_regs *regs); +int do_spe_mathemu(struct pt_regs *regs); +int speround_handler(struct pt_regs *regs); /* VMX copying */ int enter_vmx_usercopy(void); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dadfcef5d6db..dcf4046f8565 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -2103,7 +2103,6 @@ DEFINE_INTERRUPT_HANDLER(CacheLockingException) #ifdef CONFIG_SPE DEFINE_INTERRUPT_HANDLER(SPEFloatingPointException) { - extern int do_spe_mathemu(struct pt_regs *regs); unsigned long spefscr; int fpexc_mode; int code = FPE_FLTUNK; @@ -2153,7 +2152,6 @@ DEFINE_INTERRUPT_HANDLER(SPEFloatingPointException) DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException) { - extern int speround_handler(struct pt_regs *regs); int err; interrupt_cond_local_irq_enable(regs); diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile index 26fef2e5672e..603e59c3db10 100644 --- a/arch/powerpc/math-emu/Makefile +++ b/arch/powerpc/math-emu/Makefile @@ -16,3 +16,10 @@ obj-$(CONFIG_SPE) += math_efp.o CFLAGS_fabs.o = -fno-builtin-fabs CFLAGS_math.o = -fno-builtin-fabs + +ccflags-remove-y = -Wmissing-prototypes -Wmissing-declarations -Wunused-but-set-variable + +ifdef KBUILD_EXTRA_WARN +CFLAGS_math.o += -Wmissing-prototypes -Wmissing-declarations -Wunused-but-set-variable +CFLAGS_math_efp.o += -Wmissing-prototypes -Wmissing-declarations -Wunused-but-set-variable +endif diff --git a/arch/powerpc/math-emu/math_efp.c b/arch/powerpc/math-emu/math_efp.c index f01e3475f689..34f62aafe706 100644 --- a/arch/powerpc/math-emu/math_efp.c +++ b/arch/powerpc/math-emu/math_efp.c @@ -890,7 +890,7 @@ int speround_handler(struct pt_regs *regs) return 0; } -int __init spe_mathemu_init(void) +static int __init spe_mathemu_init(void) { u32 pvr, maj, min; -- cgit v1.2.3 From b11931e9adc1a439eab75c97ca4b9f15754cdcb1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 1 Sep 2022 21:03:34 +1000 Subject: powerpc/64s: add pte_needs_flush and huge_pmd_needs_flush Allow PTE changes to avoid flushing the TLB when access permissions are being relaxed, the dirty bit is being set, and the accessed bit is being changed. Relaxing access permissions and setting dirty and accessed bits do not require a flush because the MMU will re-load the PTE and notice the updates (it may also cause a spurious fault). Clearing the accessed bit does not require a flush because of the imprecise PTE accessed bit accounting that is already performed, as documented in ptep_clear_flush_young(). This reduces TLB flushing for some mprotect(2) calls. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Christophe Leroy Link: https://lore.kernel.org/r/20220901110334.1618913-1-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/pgtable.h | 3 ++ arch/powerpc/include/asm/book3s/64/tlbflush.h | 56 +++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 486902aff040..b7fd9b69e828 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -413,6 +413,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] + * + * Note: this optimisation also exists in pte_needs_flush() and + * huge_pmd_needs_flush(). */ #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young ptep_test_and_clear_young diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 206f920fe5b9..67655cd60545 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -163,6 +163,62 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, */ } +static inline bool __pte_flags_need_flush(unsigned long oldval, + unsigned long newval) +{ + unsigned long delta = oldval ^ newval; + + /* + * The return value of this function doesn't matter for hash, + * ptep_modify_prot_start() does a pte_update() which does or schedules + * any necessary hash table update and flush. + */ + if (!radix_enabled()) + return true; + + /* + * We do not expect kernel mappings or non-PTEs or not-present PTEs. + */ + VM_WARN_ON_ONCE(oldval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(newval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(!(oldval & _PAGE_PTE)); + VM_WARN_ON_ONCE(!(newval & _PAGE_PTE)); + VM_WARN_ON_ONCE(!(oldval & _PAGE_PRESENT)); + VM_WARN_ON_ONCE(!(newval & _PAGE_PRESENT)); + + /* + * Must flush on any change except READ, WRITE, EXEC, DIRTY, ACCESSED. + * + * In theory, some changed software bits could be tolerated, in + * practice those should rarely if ever matter. + */ + + if (delta & ~(_PAGE_RWX | _PAGE_DIRTY | _PAGE_ACCESSED)) + return true; + + /* + * If any of the above was present in old but cleared in new, flush. + * With the exception of _PAGE_ACCESSED, don't worry about flushing + * if that was cleared (see the comment in ptep_clear_flush_young()). + */ + if ((delta & ~_PAGE_ACCESSED) & oldval) + return true; + + return false; +} + +static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) +{ + return __pte_flags_need_flush(pte_val(oldpte), pte_val(newpte)); +} +#define pte_needs_flush pte_needs_flush + +static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) +{ + return __pte_flags_need_flush(pmd_val(oldpmd), pmd_val(newpmd)); +} +#define huge_pmd_needs_flush huge_pmd_needs_flush + extern bool tlbie_capable; extern bool tlbie_enabled; -- cgit v1.2.3 From c9874d3ffeaf8ee215187692ed918b3031d996d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Aug 2018 11:47:53 -0400 Subject: termios: start unifying non-UAPI parts of asm/termios.h * new header (linut/termios_internal.h), pulled by the users of those suckers * defaults for INIT_C_CC and externs for conversion helpers moved over there * remove termios-base.h (empty now) Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDmptU7dNGZ+/Hn@ZenIV Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/asm/termios.h | 8 +------- arch/alpha/kernel/termios.c | 3 +-- arch/ia64/include/asm/termios.h | 16 ---------------- arch/mips/include/asm/termios.h | 9 --------- arch/parisc/include/asm/termios.h | 16 ---------------- arch/powerpc/include/asm/termios.h | 2 -- arch/s390/include/asm/termios.h | 14 -------------- arch/sparc/include/asm/termios.h | 8 +------- arch/sparc/kernel/termios.c | 4 ++-- drivers/tty/hvc/hvcs.c | 1 + drivers/tty/tty_io.c | 2 +- drivers/tty/tty_ioctl.c | 1 + drivers/tty/vcc.c | 1 + include/asm-generic/termios-base.h | 21 --------------------- include/asm-generic/termios.h | 20 -------------------- include/linux/termios_internal.h | 30 ++++++++++++++++++++++++++++++ 16 files changed, 39 insertions(+), 117 deletions(-) delete mode 100644 include/asm-generic/termios-base.h create mode 100644 include/linux/termios_internal.h (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/termios.h b/arch/alpha/include/asm/termios.h index bafbb0090024..17b109859e05 100644 --- a/arch/alpha/include/asm/termios.h +++ b/arch/alpha/include/asm/termios.h @@ -2,6 +2,7 @@ #ifndef _ALPHA_TERMIOS_H #define _ALPHA_TERMIOS_H +#include #include /* eof=^D eol=\0 eol2=\0 erase=del @@ -12,11 +13,4 @@ */ #define INIT_C_CC "\004\000\000\177\027\025\022\000\003\034\032\000\021\023\026\025\001\000" -int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); -int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); -int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); - #endif /* _ALPHA_TERMIOS_H */ diff --git a/arch/alpha/kernel/termios.c b/arch/alpha/kernel/termios.c index 1534f39cb9fe..a4c29a22edf7 100644 --- a/arch/alpha/kernel/termios.c +++ b/arch/alpha/kernel/termios.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include +#include int user_termio_to_kernel_termios(struct ktermios *termios, struct termio __user *termio) diff --git a/arch/ia64/include/asm/termios.h b/arch/ia64/include/asm/termios.h index e7b2654aeb6f..1cef02701401 100644 --- a/arch/ia64/include/asm/termios.h +++ b/arch/ia64/include/asm/termios.h @@ -10,20 +10,4 @@ #include - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); -int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); -int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); - #endif /* _ASM_IA64_TERMIOS_H */ diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h index 5e8c9d137dee..dbb62330b7a4 100644 --- a/arch/mips/include/asm/termios.h +++ b/arch/mips/include/asm/termios.h @@ -21,13 +21,4 @@ */ #define INIT_C_CC "\003\034\177\025\1\0\0\0\021\023\032\0\022\017\027\026\004\0" -#include - -int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); -int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); -int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); - #endif /* _ASM_TERMIOS_H */ diff --git a/arch/parisc/include/asm/termios.h b/arch/parisc/include/asm/termios.h index fe21bad7d2b1..1850a90befb3 100644 --- a/arch/parisc/include/asm/termios.h +++ b/arch/parisc/include/asm/termios.h @@ -4,20 +4,4 @@ #include - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); -int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); -int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); - #endif /* _PARISC_TERMIOS_H */ diff --git a/arch/powerpc/include/asm/termios.h b/arch/powerpc/include/asm/termios.h index 205de8f8a9d3..5c003322fe29 100644 --- a/arch/powerpc/include/asm/termios.h +++ b/arch/powerpc/include/asm/termios.h @@ -13,6 +13,4 @@ /* ^C ^\ del ^U ^D 1 0 0 0 0 ^W ^R ^Z ^Q ^S ^V ^U */ #define INIT_C_CC "\003\034\177\025\004\001\000\000\000\000\027\022\032\021\023\026\025" -#include - #endif /* _ASM_POWERPC_TERMIOS_H */ diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h index 46fa3020b41e..0e26fe97b0d4 100644 --- a/arch/s390/include/asm/termios.h +++ b/arch/s390/include/asm/termios.h @@ -9,18 +9,4 @@ #include - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios2)) -#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios2)) - -#include - #endif /* _S390_TERMIOS_H */ diff --git a/arch/sparc/include/asm/termios.h b/arch/sparc/include/asm/termios.h index 03bcb6e6abe8..bafd7768f309 100644 --- a/arch/sparc/include/asm/termios.h +++ b/arch/sparc/include/asm/termios.h @@ -3,6 +3,7 @@ #define _SPARC_TERMIOS_H #include +#include /* intr=^C quit=^\ erase=del kill=^U @@ -13,11 +14,4 @@ */ #define INIT_C_CC "\003\034\177\025\004\000\000\000\021\023\032\031\022\025\027\026\001" -int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); -int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); -int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); - #endif /* _SPARC_TERMIOS_H */ diff --git a/arch/sparc/kernel/termios.c b/arch/sparc/kernel/termios.c index 97e23d4ae2e2..ee64965c27cd 100644 --- a/arch/sparc/kernel/termios.c +++ b/arch/sparc/kernel/termios.c @@ -1,5 +1,5 @@ -#include -#include +// SPDX-License-Identifier: GPL-2.0 +#include /* * c_cc characters in the termio structure. Oh, how I love being diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c index b79ce8d34f11..4ba24963685e 100644 --- a/drivers/tty/hvc/hvcs.c +++ b/drivers/tty/hvc/hvcs.c @@ -69,6 +69,7 @@ #include #include #include +#include #include /* diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 82a8855981f7..571c94c81477 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -99,8 +99,8 @@ #include #include #include - #include +#include #include #include diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index 026429276637..ce511557b98b 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "tty.h" #include diff --git a/drivers/tty/vcc.c b/drivers/tty/vcc.c index e11383ae1e7e..34ba6e54789a 100644 --- a/drivers/tty/vcc.c +++ b/drivers/tty/vcc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/include/asm-generic/termios-base.h b/include/asm-generic/termios-base.h deleted file mode 100644 index d6536b2214ae..000000000000 --- a/include/asm-generic/termios-base.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* termios.h: generic termios/termio user copying/translation - */ - -#ifndef _ASM_GENERIC_TERMIOS_BASE_H -#define _ASM_GENERIC_TERMIOS_BASE_H - -#include - -#ifndef __ARCH_TERMIO_GETPUT - -int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); -int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); -int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); - -#endif /* __ARCH_TERMIO_GETPUT */ - -#endif /* _ASM_GENERIC_TERMIOS_BASE_H */ diff --git a/include/asm-generic/termios.h b/include/asm-generic/termios.h index 61b07d9ce8d0..da3b0fe25442 100644 --- a/include/asm-generic/termios.h +++ b/include/asm-generic/termios.h @@ -6,24 +6,4 @@ #include #include -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -#ifdef TCGETS2 -int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); -int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); -int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); -#else /* TCGETS2 */ -int user_termios_to_kernel_termios(struct ktermios *, struct termios __user *); -int kernel_termios_to_user_termios(struct termios __user *, struct ktermios *); -#endif /* TCGETS2 */ - #endif /* _ASM_GENERIC_TERMIOS_H */ diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h new file mode 100644 index 000000000000..103ca0370948 --- /dev/null +++ b/include/linux/termios_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TERMIOS_CONV_H +#define _LINUX_TERMIOS_CONV_H + +#include +#include + +#ifndef INIT_C_CC +/* intr=^C quit=^\ erase=del kill=^U + eof=^D vtime=\0 vmin=\1 sxtc=\0 + start=^Q stop=^S susp=^Z eol=\0 + reprint=^R discard=^U werase=^W lnext=^V + eol2=\0 +*/ +#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" +#endif + +int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); +int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); +#ifdef TCGETS2 +int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); +int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); +int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); +int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); +#else /* TCGETS2 */ +int user_termios_to_kernel_termios(struct ktermios *, struct termios __user *); +int kernel_termios_to_user_termios(struct termios __user *, struct ktermios *); +#endif /* TCGETS2 */ + +#endif /* _LINUX_TERMIOS_CONV_H */ -- cgit v1.2.3 From 38fc315a73f7a1b2d19eb32dd55de089b78b2a54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 17:17:04 -0400 Subject: termios: consolidate values for VDISCARD in INIT_C_CC On old systems it used to be ^O. Linux had never actually used the value, but INIT_C_CC (on i386) did initialize it to ^O; unfortunately, it had a typo in the comment claiming that to be ^U. Most of the architectures copied the (correct) definition along with mistaken comment. alpha, powerpc and sparc tried to make the definition match comment. However, util-linux still resets it to ^O on any architecture, ^O is the historical value, kernel ignores it anyway and finally, Linus said "Just change everybody to do the same, nobody cares about VDISCARD". Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDmy//MKzs3ye7l@ZenIV Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/asm/termios.h | 4 ++-- arch/mips/include/asm/termios.h | 2 +- arch/powerpc/include/asm/termios.h | 4 ++-- arch/sparc/include/asm/termios.h | 4 ++-- include/linux/termios_internal.h | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/termios.h b/arch/alpha/include/asm/termios.h index 17b109859e05..9cc784d0a83c 100644 --- a/arch/alpha/include/asm/termios.h +++ b/arch/alpha/include/asm/termios.h @@ -8,9 +8,9 @@ /* eof=^D eol=\0 eol2=\0 erase=del werase=^W kill=^U reprint=^R sxtc=\0 intr=^C quit=^\ susp=^Z - start=^Q stop=^S lnext=^V discard=^U + start=^Q stop=^S lnext=^V discard=^O vmin=\1 vtime=\0 */ -#define INIT_C_CC "\004\000\000\177\027\025\022\000\003\034\032\000\021\023\026\025\001\000" +#define INIT_C_CC "\004\000\000\177\027\025\022\000\003\034\032\000\021\023\026\017\001\000" #endif /* _ALPHA_TERMIOS_H */ diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h index dbb62330b7a4..059c800afaa1 100644 --- a/arch/mips/include/asm/termios.h +++ b/arch/mips/include/asm/termios.h @@ -16,7 +16,7 @@ * intr=^C quit=^\ erase=del kill=^U * vmin=\1 vtime=\0 eol2=\0 swtc=\0 * start=^Q stop=^S susp=^Z vdsusp= - * reprint=^R discard=^U werase=^W lnext=^V + * reprint=^R discard=^O werase=^W lnext=^V * eof=^D eol=\0 */ #define INIT_C_CC "\003\034\177\025\1\0\0\0\021\023\032\0\022\017\027\026\004\0" diff --git a/arch/powerpc/include/asm/termios.h b/arch/powerpc/include/asm/termios.h index 5c003322fe29..e18a05a6ed34 100644 --- a/arch/powerpc/include/asm/termios.h +++ b/arch/powerpc/include/asm/termios.h @@ -10,7 +10,7 @@ #include -/* ^C ^\ del ^U ^D 1 0 0 0 0 ^W ^R ^Z ^Q ^S ^V ^U */ -#define INIT_C_CC "\003\034\177\025\004\001\000\000\000\000\027\022\032\021\023\026\025" +/* ^C ^\ del ^U ^D 1 0 0 0 0 ^W ^R ^Z ^Q ^S ^V ^O */ +#define INIT_C_CC "\003\034\177\025\004\001\000\000\000\000\027\022\032\021\023\026\017" #endif /* _ASM_POWERPC_TERMIOS_H */ diff --git a/arch/sparc/include/asm/termios.h b/arch/sparc/include/asm/termios.h index bafd7768f309..60f90465fc12 100644 --- a/arch/sparc/include/asm/termios.h +++ b/arch/sparc/include/asm/termios.h @@ -9,9 +9,9 @@ /* intr=^C quit=^\ erase=del kill=^U eof=^D eol=\0 eol2=\0 sxtc=\0 start=^Q stop=^S susp=^Z dsusp=^Y - reprint=^R discard=^U werase=^W lnext=^V + reprint=^R discard=^O werase=^W lnext=^V vmin=\1 vtime=\0 */ -#define INIT_C_CC "\003\034\177\025\004\000\000\000\021\023\032\031\022\025\027\026\001" +#define INIT_C_CC "\003\034\177\025\004\000\000\000\021\023\032\031\022\017\027\026\001" #endif /* _SPARC_TERMIOS_H */ diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 103ca0370948..7eb3598c109d 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -9,7 +9,7 @@ /* intr=^C quit=^\ erase=del kill=^U eof=^D vtime=\0 vmin=\1 sxtc=\0 start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V + reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ #define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" -- cgit v1.2.3 From d04f9915fa44b52d7a91080677381a082238e9c4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Sep 2018 17:57:42 -0400 Subject: make generic INIT_C_CC a bit more generic turn it into an array initializer; then alpha, mips and powerpc variants fold into it. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDm7M6M91gC2RPL@ZenIV Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/asm/termios.h | 8 -------- arch/mips/include/asm/termios.h | 9 --------- arch/powerpc/include/asm/termios.h | 3 --- include/linux/termios_internal.h | 15 ++++++++++++++- 4 files changed, 14 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/termios.h b/arch/alpha/include/asm/termios.h index 9cc784d0a83c..3894fd92c508 100644 --- a/arch/alpha/include/asm/termios.h +++ b/arch/alpha/include/asm/termios.h @@ -5,12 +5,4 @@ #include #include -/* eof=^D eol=\0 eol2=\0 erase=del - werase=^W kill=^U reprint=^R sxtc=\0 - intr=^C quit=^\ susp=^Z - start=^Q stop=^S lnext=^V discard=^O - vmin=\1 vtime=\0 -*/ -#define INIT_C_CC "\004\000\000\177\027\025\022\000\003\034\032\000\021\023\026\017\001\000" - #endif /* _ALPHA_TERMIOS_H */ diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h index 059c800afaa1..12bc56857bf1 100644 --- a/arch/mips/include/asm/termios.h +++ b/arch/mips/include/asm/termios.h @@ -12,13 +12,4 @@ #include #include -/* - * intr=^C quit=^\ erase=del kill=^U - * vmin=\1 vtime=\0 eol2=\0 swtc=\0 - * start=^Q stop=^S susp=^Z vdsusp= - * reprint=^R discard=^O werase=^W lnext=^V - * eof=^D eol=\0 - */ -#define INIT_C_CC "\003\034\177\025\1\0\0\0\021\023\032\0\022\017\027\026\004\0" - #endif /* _ASM_TERMIOS_H */ diff --git a/arch/powerpc/include/asm/termios.h b/arch/powerpc/include/asm/termios.h index e18a05a6ed34..83794533f607 100644 --- a/arch/powerpc/include/asm/termios.h +++ b/arch/powerpc/include/asm/termios.h @@ -10,7 +10,4 @@ #include -/* ^C ^\ del ^U ^D 1 0 0 0 0 ^W ^R ^Z ^Q ^S ^V ^O */ -#define INIT_C_CC "\003\034\177\025\004\001\000\000\000\000\027\022\032\021\023\026\017" - #endif /* _ASM_POWERPC_TERMIOS_H */ diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 7eb3598c109d..8a53141ab44a 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -12,7 +12,20 @@ reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" +#define INIT_C_CC { \ + [VINTR] = 'C'-0x40, \ + [VQUIT] = '\\'-0x40, \ + [VERASE] = '\177', \ + [VKILL] = 'U'-0x40, \ + [VEOF] = 'D'-0x40, \ + [VSTART] = 'Q'-0x40, \ + [VSTOP] = 'S'-0x40, \ + [VSUSP] = 'Z'-0x40, \ + [VREPRINT] = 'R'-0x40, \ + [VDISCARD] = 'O'-0x40, \ + [VWERASE] = 'W'-0x40, \ + [VLNEXT] = 'V'-0x40, \ + [VMIN] = 1 } #endif int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -- cgit v1.2.3 From 89bbeb7e3199e1514729aa6de8057289e6375fe6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 19:41:40 -0400 Subject: termios: get rid of non-UAPI asm/termios.h All non-UAPI asm/termios.h consist of include of UAPI counterpart and, possibly, include of linux/uaccess.h The latter can't be simply removed, even though nothing in linux/termios.h doesn't depend upon it anymore - there are several places that rely upon that indirect chain of includes to pull linux/uaccess.h. So the include needs to be lifted out of there - we lift into tty_driver.h, serdev.h and places that pull asm/termios.h, but none of * linux/uaccess.h (obvious) * net/sock.h (pulls uaccess.h) * linux/{tty,tty_driver,serdev}.h (tty.h pulls tty_driver.h) That leaves us just with the include of UAPI asm/termios.h, which is what will resolve to if we simply remove non-UAPI header. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDnKvYCHn/ogBUv@ZenIV Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/asm/termios.h | 8 -------- arch/arm/mach-ep93xx/core.c | 1 + arch/arm/mach-versatile/integrator_ap.c | 1 + arch/ia64/include/asm/termios.h | 13 ------------- arch/mips/include/asm/termios.h | 15 --------------- arch/parisc/include/asm/termios.h | 7 ------- arch/powerpc/include/asm/termios.h | 13 ------------- arch/s390/include/asm/termios.h | 12 ------------ arch/sparc/include/asm/termios.h | 8 -------- drivers/net/wwan/wwan_core.c | 1 + include/asm-generic/termios.h | 9 --------- include/linux/serdev.h | 1 + include/linux/tty_driver.h | 1 + 13 files changed, 5 insertions(+), 85 deletions(-) delete mode 100644 arch/alpha/include/asm/termios.h delete mode 100644 arch/ia64/include/asm/termios.h delete mode 100644 arch/mips/include/asm/termios.h delete mode 100644 arch/parisc/include/asm/termios.h delete mode 100644 arch/powerpc/include/asm/termios.h delete mode 100644 arch/s390/include/asm/termios.h delete mode 100644 arch/sparc/include/asm/termios.h delete mode 100644 include/asm-generic/termios.h (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/termios.h b/arch/alpha/include/asm/termios.h deleted file mode 100644 index 3894fd92c508..000000000000 --- a/arch/alpha/include/asm/termios.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ALPHA_TERMIOS_H -#define _ALPHA_TERMIOS_H - -#include -#include - -#endif /* _ALPHA_TERMIOS_H */ diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c index 2d58e273c96d..95e731676cea 100644 --- a/arch/arm/mach-ep93xx/core.c +++ b/arch/arm/mach-ep93xx/core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/mach-versatile/integrator_ap.c b/arch/arm/mach-versatile/integrator_ap.c index e216fac917d0..4bd6712e9f52 100644 --- a/arch/arm/mach-versatile/integrator_ap.c +++ b/arch/arm/mach-versatile/integrator_ap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/include/asm/termios.h b/arch/ia64/include/asm/termios.h deleted file mode 100644 index 1cef02701401..000000000000 --- a/arch/ia64/include/asm/termios.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Modified 1999 - * David Mosberger-Tang , Hewlett-Packard Co - * - * 99/01/28 Added N_IRDA and N_SMSBLOCK - */ -#ifndef _ASM_IA64_TERMIOS_H -#define _ASM_IA64_TERMIOS_H - -#include - -#endif /* _ASM_IA64_TERMIOS_H */ diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h deleted file mode 100644 index 12bc56857bf1..000000000000 --- a/arch/mips/include/asm/termios.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1995, 1996, 2000, 2001 by Ralf Baechle - * Copyright (C) 2000, 2001 Silicon Graphics, Inc. - */ -#ifndef _ASM_TERMIOS_H -#define _ASM_TERMIOS_H - -#include -#include - -#endif /* _ASM_TERMIOS_H */ diff --git a/arch/parisc/include/asm/termios.h b/arch/parisc/include/asm/termios.h deleted file mode 100644 index 1850a90befb3..000000000000 --- a/arch/parisc/include/asm/termios.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PARISC_TERMIOS_H -#define _PARISC_TERMIOS_H - -#include - -#endif /* _PARISC_TERMIOS_H */ diff --git a/arch/powerpc/include/asm/termios.h b/arch/powerpc/include/asm/termios.h deleted file mode 100644 index 83794533f607..000000000000 --- a/arch/powerpc/include/asm/termios.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Liberally adapted from alpha/termios.h. In particular, the c_cc[] - * fields have been reordered so that termio & termios share the - * common subset in the same order (for brain dead programs that don't - * know or care about the differences). - */ -#ifndef _ASM_POWERPC_TERMIOS_H -#define _ASM_POWERPC_TERMIOS_H - -#include - -#endif /* _ASM_POWERPC_TERMIOS_H */ diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h deleted file mode 100644 index 0e26fe97b0d4..000000000000 --- a/arch/s390/include/asm/termios.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * - * Derived from "include/asm-i386/termios.h" - */ -#ifndef _S390_TERMIOS_H -#define _S390_TERMIOS_H - -#include - -#endif /* _S390_TERMIOS_H */ diff --git a/arch/sparc/include/asm/termios.h b/arch/sparc/include/asm/termios.h deleted file mode 100644 index 1b85721f4e6b..000000000000 --- a/arch/sparc/include/asm/termios.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SPARC_TERMIOS_H -#define _SPARC_TERMIOS_H - -#include -#include - -#endif /* _SPARC_TERMIOS_H */ diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index b8c7843730ed..62e9f7d6c9fe 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/include/asm-generic/termios.h b/include/asm-generic/termios.h deleted file mode 100644 index da3b0fe25442..000000000000 --- a/include/asm-generic/termios.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_TERMIOS_H -#define _ASM_GENERIC_TERMIOS_H - - -#include -#include - -#endif /* _ASM_GENERIC_TERMIOS_H */ diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 3368c261ab62..66f624fc618c 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -7,6 +7,7 @@ #include #include +#include #include #include diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b2456b545ba0..f961164a5274 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 2be9880dc87342dc7ae459c9ea5c9ee2a45b33d8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 09:44:06 +0800 Subject: kernel: exit: cleanup release_thread() Only x86 has own release_thread(), introduce a new weak release_thread() function to clean empty definitions in other ARCHs. Link: https://lkml.kernel.org/r/20220819014406.32266-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Guo Ren [csky] Acked-by: Russell King (Oracle) Acked-by: Geert Uytterhoeven Acked-by: Brian Cain Acked-by: Michael Ellerman [powerpc] Acked-by: Stafford Horne [openrisc] Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Cc: Alexander Gordeev Cc: Anton Ivanov Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren [csky] Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/processor.h | 2 -- arch/alpha/kernel/process.c | 5 ----- arch/arc/include/asm/processor.h | 3 --- arch/arm/include/asm/processor.h | 3 --- arch/arm/kernel/process.c | 4 ---- arch/arm64/include/asm/processor.h | 3 --- arch/arm64/kernel/process.c | 4 ---- arch/csky/include/asm/processor.h | 5 ----- arch/hexagon/include/asm/processor.h | 4 ---- arch/hexagon/kernel/process.c | 7 ------- arch/ia64/include/asm/processor.h | 7 ------- arch/loongarch/include/asm/processor.h | 3 --- arch/m68k/include/asm/processor.h | 5 ----- arch/microblaze/include/asm/processor.h | 5 ----- arch/mips/include/asm/processor.h | 3 --- arch/nios2/include/asm/processor.h | 5 ----- arch/openrisc/include/asm/processor.h | 1 - arch/openrisc/kernel/process.c | 4 ---- arch/parisc/include/asm/processor.h | 3 --- arch/parisc/kernel/process.c | 4 ---- arch/powerpc/include/asm/processor.h | 1 - arch/powerpc/kernel/process.c | 5 ----- arch/riscv/include/asm/processor.h | 5 ----- arch/s390/include/asm/processor.h | 3 --- arch/sh/include/asm/processor_32.h | 3 --- arch/sh/kernel/process_32.c | 5 ----- arch/sparc/include/asm/processor_32.h | 3 --- arch/sparc/include/asm/processor_64.h | 3 --- arch/um/include/asm/processor-generic.h | 4 ---- arch/x86/include/asm/processor.h | 3 --- arch/xtensa/include/asm/processor.h | 3 --- include/linux/sched/task.h | 3 +++ kernel/exit.c | 4 ++++ 33 files changed, 7 insertions(+), 118 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 43e234c518b1..714abe494e5f 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -36,8 +36,6 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long); /* Free all resources held by a thread. */ struct task_struct; -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e2e25f8b5e76..dbf1bc5e2ad2 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -225,11 +225,6 @@ flush_thread(void) current_thread_info()->pcb.unique = 0; } -void -release_thread(struct task_struct *dead_task) -{ -} - /* * Copy architecture-specific thread state */ diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 54db9d7bb562..fb844fce1ab6 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -43,9 +43,6 @@ struct task_struct; #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + (void *)task_stack_page(p)) - 1) -/* Free all resources held by a thread */ -#define release_thread(thread) do { } while (0) - /* * A lot of busy-wait loops in SMP are based off of non-volatile data otherwise * get optimised away by gcc diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index bdc35c0e8dfb..326864f79d18 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -81,9 +81,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3d9cace63884..712d3e6d9be9 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -232,10 +232,6 @@ void flush_thread(void) thread_notify(THREAD_NOTIFY_FLUSH, thread); } -void release_thread(struct task_struct *dead_task) -{ -} - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 86eb0bfe3b38..4cfb4cd1d475 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -323,9 +323,6 @@ static inline bool is_ttbr1_addr(unsigned long addr) /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); void update_sctlr_el1(u64 sctlr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 92bcc1768f0b..9015f49c206e 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -279,10 +279,6 @@ void flush_thread(void) flush_tagged_addr_state(); } -void release_thread(struct task_struct *dead_task) -{ -} - void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 9638206bc44f..63ad71fab30d 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -69,11 +69,6 @@ do { \ /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 615f7e49968e..0cd39c2cdf8f 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -60,10 +60,6 @@ struct thread_struct { #define KSTK_EIP(tsk) (pt_elr(task_pt_regs(tsk))) #define KSTK_ESP(tsk) (pt_psp(task_pt_regs(tsk))) -/* Free all resources held by a thread; defined in process.c */ -extern void release_thread(struct task_struct *dead_task); - -/* Get wait channel for task P. */ extern unsigned long __get_wchan(struct task_struct *p); /* The following stuff is pretty HEXAGON specific. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index f0552f98a7ba..e15eeaebd785 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -112,13 +112,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) return 0; } -/* - * Release any architecture-specific resources locked by thread - */ -void release_thread(struct task_struct *dead_task) -{ -} - /* * Some archs flush debug and FPU info here */ diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 757c2f6d8d4b..d1978e004054 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -318,13 +318,6 @@ struct thread_struct { struct mm_struct; struct task_struct; -/* - * Free all resources held by a thread. This is called after the - * parent of DEAD_TASK has collected the exit status of the task via - * wait(). - */ -#define release_thread(dead_task) - /* Get wait channel for task P. */ extern unsigned long __get_wchan (struct task_struct *p); diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h index 1c4b4308378d..6954dc5d24e9 100644 --- a/arch/loongarch/include/asm/processor.h +++ b/arch/loongarch/include/asm/processor.h @@ -176,9 +176,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while (0) - enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; extern unsigned long boot_option_idle_override; diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index d86b4009880b..7a2da780830b 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -145,11 +145,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); void show_registers(struct pt_regs *regs); diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 7e9e92670df3..4e193c7550df 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -63,11 +63,6 @@ struct thread_struct { .pgdir = swapper_pg_dir, \ } -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); /* The size allocated for kernel stacks. This _must_ be a power of two! */ diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 4bb24579d12e..3fde1ff72bd1 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -344,9 +344,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - /* * Do necessary setup to start up a newly executed thread. */ diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index b8125dfbcad2..8916d93d5c2d 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -64,11 +64,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long pc, struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index aa1699c18add..ed9efb430afa 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -72,7 +72,6 @@ struct thread_struct { void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); -void release_thread(struct task_struct *); unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 52dc983ddeba..f94b5ec06786 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -125,10 +125,6 @@ void show_regs(struct pt_regs *regs) show_registers(regs); } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Copy the thread-specific (arch specific) info from the current * process to the new one p diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 4621ceb51314..a608970b249a 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -266,9 +266,6 @@ on downward growing arches, it looks like this: struct mm_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs.iaoq[0]) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 7c37e09c92da..3db0e97e6c06 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -146,10 +146,6 @@ void flush_thread(void) */ } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Idle thread support * diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fdfaae194ddd..92e332415d02 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -75,7 +75,6 @@ extern int _chrp_type; struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); -void release_thread(struct task_struct *); #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET] #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET] diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0fbda89cd1bb..991cda25b9a9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1655,11 +1655,6 @@ EXPORT_SYMBOL_GPL(set_thread_tidr); #endif /* CONFIG_PPC64 */ -void -release_thread(struct task_struct *t) -{ -} - /* * this gets called so that we can store coprocessor state into memory and * copy the current task into the new thread. diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 19eedd4af4cd..94a0590c6971 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -65,11 +65,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, extern void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index bd66f8e34949..c52fe651eeba 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -186,9 +186,6 @@ struct pt_regs; void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *tsk) { } - /* Free guarded storage control block */ void guarded_storage_release(struct task_struct *tsk); void gs_load_bc_cb(struct pt_regs *regs); diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 45240ec6b85a..27aebf1e75a2 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -127,9 +127,6 @@ struct task_struct; extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned long new_sp); -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - /* * FPU lazy state save handling. */ diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index a808843375e7..92b6649d4929 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -84,11 +84,6 @@ void flush_thread(void) #endif } -void release_thread(struct task_struct *dead_task) -{ - /* do nothing */ -} - asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index b26c35336b51..ba8b70ffec08 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -80,9 +80,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, : "memory"); } -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while(0) - unsigned long __get_wchan(struct task_struct *); #define task_pt_regs(tsk) ((tsk)->thread.kregs) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 89850dff6b03..2667f35d5ea5 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -176,9 +176,6 @@ do { \ regs->tstate &= ~TSTATE_PEF; \ } while (0) -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while (0) - unsigned long __get_wchan(struct task_struct *task); #define task_pt_regs(tsk) (task_thread_info(tsk)->kregs) diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index d0fc1862da95..bb5f06480da9 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -55,10 +55,6 @@ struct thread_struct { .request = { 0 } \ } -static inline void release_thread(struct task_struct *task) -{ -} - /* * User space process size: 3GB (default). */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 356308c73951..67c9d73b31fa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -587,9 +587,6 @@ static inline void load_sp0(unsigned long sp0) #endif /* CONFIG_PARAVIRT_XXL */ -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); /* diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 76bc63127c66..5abde43c570c 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -221,9 +221,6 @@ struct thread_struct { struct task_struct; struct mm_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 81cab4b01edc..d6c48163c6de 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -127,6 +127,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr) void put_task_struct_rcu_user(struct task_struct *task); +/* Free all architecture-specific resources held by a thread. */ +void release_thread(struct task_struct *dead_task); + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else diff --git a/kernel/exit.c b/kernel/exit.c index 84021b24f79e..f4b7b058f4e6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -183,6 +183,10 @@ void put_task_struct_rcu_user(struct task_struct *task) call_rcu(&task->rcu, delayed_put_task_struct); } +void __weak release_thread(struct task_struct *dead_task) +{ +} + void release_task(struct task_struct *p) { struct task_struct *leader; -- cgit v1.2.3 From f88aabad33ea22be2ce1c60d8901942e4e2a9edb Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Wed, 7 Sep 2022 17:01:11 -0500 Subject: Revert "powerpc/rtas: Implement reentrant rtas call" At the time this was submitted by Leonardo, I confirmed -- or thought I had confirmed -- with PowerVM partition firmware development that the following RTAS functions: - ibm,get-xive - ibm,int-off - ibm,int-on - ibm,set-xive were safe to call on multiple CPUs simultaneously, not only with respect to themselves as indicated by PAPR, but with arbitrary other RTAS calls: https://lore.kernel.org/linuxppc-dev/875zcy2v8o.fsf@linux.ibm.com/ Recent discussion with firmware development makes it clear that this is not true, and that the code in commit b664db8e3f97 ("powerpc/rtas: Implement reentrant rtas call") is unsafe, likely explaining several strange bugs we've seen in internal testing involving DLPAR and LPM. These scenarios use ibm,configure-connector, whose internal state can be corrupted by the concurrent use of the "reentrant" functions, leading to symptoms like endless busy statuses from RTAS. Fixes: b664db8e3f97 ("powerpc/rtas: Implement reentrant rtas call") Cc: stable@vger.kernel.org # v5.8+ Signed-off-by: Nathan Lynch Reviewed-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220907220111.223267-1-nathanl@linux.ibm.com --- arch/powerpc/include/asm/paca.h | 1 - arch/powerpc/include/asm/rtas.h | 1 - arch/powerpc/kernel/paca.c | 32 ---------------------- arch/powerpc/kernel/rtas.c | 54 ------------------------------------- arch/powerpc/sysdev/xics/ics-rtas.c | 22 +++++++-------- 5 files changed, 11 insertions(+), 99 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 4d7aaab82702..3537b0500f4d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -263,7 +263,6 @@ struct paca_struct { u64 l1d_flush_size; #endif #ifdef CONFIG_PPC_PSERIES - struct rtas_args *rtas_args_reentrant; u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */ #endif /* CONFIG_PPC_PSERIES */ diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 00531af17ce0..56319aea646e 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -240,7 +240,6 @@ extern struct rtas_t rtas; extern int rtas_token(const char *service); extern int rtas_service_present(const char *service); extern int rtas_call(int token, int, int, int *, ...); -int rtas_call_reentrant(int token, int nargs, int nret, int *outputs, ...); void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...); extern void __noreturn rtas_restart(char *cmd); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index ba593fd60124..dfd097b79160 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "setup.h" @@ -170,30 +169,6 @@ static struct slb_shadow * __init new_slb_shadow(int cpu, unsigned long limit) } #endif /* CONFIG_PPC_64S_HASH_MMU */ -#ifdef CONFIG_PPC_PSERIES -/** - * new_rtas_args() - Allocates rtas args - * @cpu: CPU number - * @limit: Memory limit for this allocation - * - * Allocates a struct rtas_args and return it's pointer, - * if not in Hypervisor mode - * - * Return: Pointer to allocated rtas_args - * NULL if CPU in Hypervisor Mode - */ -static struct rtas_args * __init new_rtas_args(int cpu, unsigned long limit) -{ - limit = min_t(unsigned long, limit, RTAS_INSTANTIATE_MAX); - - if (early_cpu_has_feature(CPU_FTR_HVMODE)) - return NULL; - - return alloc_paca_data(sizeof(struct rtas_args), L1_CACHE_BYTES, - limit, cpu); -} -#endif /* CONFIG_PPC_PSERIES */ - /* The Paca is an array with one entry per processor. Each contains an * lppaca, which contains the information shared between the * hypervisor and Linux. @@ -232,10 +207,6 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) /* For now -- if we have threads this will be adjusted later */ new_paca->tcd_ptr = &new_paca->tcd; #endif - -#ifdef CONFIG_PPC_PSERIES - new_paca->rtas_args_reentrant = NULL; -#endif } /* Put the paca pointer into r13 and SPRG_PACA */ @@ -307,9 +278,6 @@ void __init allocate_paca(int cpu) #endif #ifdef CONFIG_PPC_64S_HASH_MMU paca->slb_shadow_ptr = new_slb_shadow(cpu, limit); -#endif -#ifdef CONFIG_PPC_PSERIES - paca->rtas_args_reentrant = new_rtas_args(cpu, limit); #endif paca_struct_size += sizeof(struct paca_struct); } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 693133972294..0b8a858aa847 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -43,7 +43,6 @@ #include #include #include -#include /* This is here deliberately so it's only used in this file */ void enter_rtas(unsigned long); @@ -932,59 +931,6 @@ void rtas_activate_firmware(void) pr_err("ibm,activate-firmware failed (%i)\n", fwrc); } -#ifdef CONFIG_PPC_PSERIES -/** - * rtas_call_reentrant() - Used for reentrant rtas calls - * @token: Token for desired reentrant RTAS call - * @nargs: Number of Input Parameters - * @nret: Number of Output Parameters - * @outputs: Array of outputs - * @...: Inputs for desired RTAS call - * - * According to LoPAR documentation, only "ibm,int-on", "ibm,int-off", - * "ibm,get-xive" and "ibm,set-xive" are currently reentrant. - * Reentrant calls need their own rtas_args buffer, so not using rtas.args, but - * PACA one instead. - * - * Return: -1 on error, - * First output value of RTAS call if (nret > 0), - * 0 otherwise, - */ -int rtas_call_reentrant(int token, int nargs, int nret, int *outputs, ...) -{ - va_list list; - struct rtas_args *args; - unsigned long flags; - int i, ret = 0; - - if (!rtas.entry || token == RTAS_UNKNOWN_SERVICE) - return -1; - - local_irq_save(flags); - preempt_disable(); - - /* We use the per-cpu (PACA) rtas args buffer */ - args = local_paca->rtas_args_reentrant; - - va_start(list, outputs); - va_rtas_call_unlocked(args, token, nargs, nret, list); - va_end(list); - - if (nret > 1 && outputs) - for (i = 0; i < nret - 1; ++i) - outputs[i] = be32_to_cpu(args->rets[i + 1]); - - if (nret > 0) - ret = be32_to_cpu(args->rets[0]); - - local_irq_restore(flags); - preempt_enable(); - - return ret; -} - -#endif /* CONFIG_PPC_PSERIES */ - /** * get_pseries_errorlog() - Find a specific pseries error log in an RTAS * extended event log. diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index 9e7007f9aca5..f8320f8e5bc7 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -36,8 +36,8 @@ static void ics_rtas_unmask_irq(struct irq_data *d) server = xics_get_irq_server(d->irq, irq_data_get_affinity_mask(d), 0); - call_status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL, hw_irq, - server, DEFAULT_PRIORITY); + call_status = rtas_call(ibm_set_xive, 3, 1, NULL, hw_irq, server, + DEFAULT_PRIORITY); if (call_status != 0) { printk(KERN_ERR "%s: ibm_set_xive irq %u server %x returned %d\n", @@ -46,7 +46,7 @@ static void ics_rtas_unmask_irq(struct irq_data *d) } /* Now unmask the interrupt (often a no-op) */ - call_status = rtas_call_reentrant(ibm_int_on, 1, 1, NULL, hw_irq); + call_status = rtas_call(ibm_int_on, 1, 1, NULL, hw_irq); if (call_status != 0) { printk(KERN_ERR "%s: ibm_int_on irq=%u returned %d\n", __func__, hw_irq, call_status); @@ -68,7 +68,7 @@ static void ics_rtas_mask_real_irq(unsigned int hw_irq) if (hw_irq == XICS_IPI) return; - call_status = rtas_call_reentrant(ibm_int_off, 1, 1, NULL, hw_irq); + call_status = rtas_call(ibm_int_off, 1, 1, NULL, hw_irq); if (call_status != 0) { printk(KERN_ERR "%s: ibm_int_off irq=%u returned %d\n", __func__, hw_irq, call_status); @@ -76,8 +76,8 @@ static void ics_rtas_mask_real_irq(unsigned int hw_irq) } /* Have to set XIVE to 0xff to be able to remove a slot */ - call_status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL, hw_irq, - xics_default_server, 0xff); + call_status = rtas_call(ibm_set_xive, 3, 1, NULL, hw_irq, + xics_default_server, 0xff); if (call_status != 0) { printk(KERN_ERR "%s: ibm_set_xive(0xff) irq=%u returned %d\n", __func__, hw_irq, call_status); @@ -108,7 +108,7 @@ static int ics_rtas_set_affinity(struct irq_data *d, if (hw_irq == XICS_IPI || hw_irq == XICS_IRQ_SPURIOUS) return -1; - status = rtas_call_reentrant(ibm_get_xive, 1, 3, xics_status, hw_irq); + status = rtas_call(ibm_get_xive, 1, 3, xics_status, hw_irq); if (status) { printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", @@ -126,8 +126,8 @@ static int ics_rtas_set_affinity(struct irq_data *d, pr_debug("%s: irq %d [hw 0x%x] server: 0x%x\n", __func__, d->irq, hw_irq, irq_server); - status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL, - hw_irq, irq_server, xics_status[1]); + status = rtas_call(ibm_set_xive, 3, 1, NULL, + hw_irq, irq_server, xics_status[1]); if (status) { printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n", @@ -158,7 +158,7 @@ static int ics_rtas_check(struct ics *ics, unsigned int hw_irq) return -EINVAL; /* Check if RTAS knows about this interrupt */ - rc = rtas_call_reentrant(ibm_get_xive, 1, 3, status, hw_irq); + rc = rtas_call(ibm_get_xive, 1, 3, status, hw_irq); if (rc) return -ENXIO; @@ -174,7 +174,7 @@ static long ics_rtas_get_server(struct ics *ics, unsigned long vec) { int rc, status[2]; - rc = rtas_call_reentrant(ibm_get_xive, 1, 3, status, vec); + rc = rtas_call(ibm_get_xive, 1, 3, status, vec); if (rc) return -1; return status[0]; -- cgit v1.2.3 From b5a472ad81ba23327ef11ca5b4ba9fd8ed38e45e Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 13 Sep 2022 15:50:24 +0800 Subject: powerpc: remove unused udbg_init_debug_beat() declaration udbg_init_debug_beat() has been removed since commit bf4981a00636 ("powerpc: Remove the celleb support"), so remove it. Signed-off-by: Gaosheng Cui Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220913075029.682327-5-cuigaosheng1@huawei.com --- arch/powerpc/include/asm/udbg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index b4aa0d88ce2c..524c2085070f 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -42,7 +42,6 @@ extern void __init udbg_init_maple_realmode(void); extern void __init udbg_init_pas_realmode(void); extern void __init udbg_init_rtas_panel(void); extern void __init udbg_init_rtas_console(void); -extern void __init udbg_init_debug_beat(void); extern void __init udbg_init_btext(void); extern void __init udbg_init_44x_as1(void); extern void __init udbg_init_40x_realmode(void); -- cgit v1.2.3 From d24b8f01fe7b848f88ff0a3204a674a092f365d0 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 13 Sep 2022 15:50:25 +0800 Subject: powerpc/mm: remove orphan declarations from mmu_context.h Remove the following orphan declarations from mmu_context.h: 1. switch_cop() and drop_cop() have been removed since commit 6ff4d3e96652 ("powerpc: Remove old unused icswx based coprocessor support"). 2. mm_iommu_cleanup() has been removed since commit 4b6fad7097f8 ("powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown"). So remove the declarations for them from header file. Signed-off-by: Gaosheng Cui Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220913075029.682327-6-cuigaosheng1@huawei.com --- arch/powerpc/include/asm/mmu_context.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 3f25bd3e14eb..c1ea270bb848 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -31,7 +31,6 @@ extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, extern long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_init(struct mm_struct *mm); -extern void mm_iommu_cleanup(struct mm_struct *mm); extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, unsigned long ua, unsigned long size); extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, @@ -117,7 +116,6 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea) } #endif -extern void switch_cop(struct mm_struct *next); extern int use_cop(unsigned long acop, struct mm_struct *mm); extern void drop_cop(unsigned long acop, struct mm_struct *mm); -- cgit v1.2.3 From 77d30535816e90ff6a4466210c403a6b8b42a0a5 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 13 Sep 2022 15:50:26 +0800 Subject: powerpc/powernv: remove orphan declarations from opal.h Remove the following orphan declarations from opal.h: 1. opal_notifier_register() 2. opal_notifier_unregister() 3. opal_notifier_update_evt() 4. opal_notifier_enable() 5. opal_notifier_disable() They have been removed since commit 81f2f7ce4c5b ("opal: Remove events notifier"), so remove them. Signed-off-by: Gaosheng Cui Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220913075029.682327-7-cuigaosheng1@huawei.com --- arch/powerpc/include/asm/opal.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index bfd3142cd0ba..726125a534de 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -324,16 +324,10 @@ extern int opal_flush_console(uint32_t vtermno); extern void hvc_opal_init_early(void); -extern int opal_notifier_register(struct notifier_block *nb); -extern int opal_notifier_unregister(struct notifier_block *nb); - extern int opal_message_notifier_register(enum opal_msg_type msg_type, struct notifier_block *nb); extern int opal_message_notifier_unregister(enum opal_msg_type msg_type, struct notifier_block *nb); -extern void opal_notifier_enable(void); -extern void opal_notifier_disable(void); -extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val); extern int opal_async_get_token_interruptible(void); extern int opal_async_release_token(int token); -- cgit v1.2.3 From 3abed8acfe95046b27117f48d52dccd2ea82a322 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 13 Sep 2022 15:50:27 +0800 Subject: powerpc/sysdev: remove unused xics_ipi_dispatch() declaration xics_ipi_dispatch() has been removed since commit 23d72bfd8f9f ("powerpc: Consolidate ipi message mux and demux"), so remove it. Signed-off-by: Gaosheng Cui Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220913075029.682327-8-cuigaosheng1@huawei.com --- arch/powerpc/include/asm/xics.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index e2e704eca5f6..89090485bec1 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -159,7 +159,6 @@ extern void xics_setup_cpu(void); extern void xics_update_irq_servers(void); extern void xics_set_cpu_giq(unsigned int gserver, unsigned int join); extern void xics_mask_unknown_vec(unsigned int vec); -extern irqreturn_t xics_ipi_dispatch(int cpu); extern void xics_smp_probe(void); extern void xics_register_ics(struct ics *ics); extern void xics_teardown_cpu(void); -- cgit v1.2.3 From b47f0024f990803f36e6a06f82e9d0dbe8424c26 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 13 Sep 2022 15:50:28 +0800 Subject: powerpc/ps3: remove orphan declarations from ps3av.h Remove the following orphan declarations from ps3av.h: 1. ps3av_dev_open() 2. ps3av_dev_close() They have been removed since commit 13a5e30cf740 ("[POWERPC] PS3: Rework AV settings driver"), so remove them. Signed-off-by: Gaosheng Cui Acked-by: Geoff Levand Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220913075029.682327-9-cuigaosheng1@huawei.com --- arch/powerpc/include/asm/ps3av.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ps3av.h b/arch/powerpc/include/asm/ps3av.h index 82db78fc169d..c8b0f2ffcd35 100644 --- a/arch/powerpc/include/asm/ps3av.h +++ b/arch/powerpc/include/asm/ps3av.h @@ -726,6 +726,4 @@ extern int ps3av_video_mode2res(u32, u32 *, u32 *); extern int ps3av_video_mute(int); extern int ps3av_audio_mute(int); extern int ps3av_audio_mute_analog(int); -extern int ps3av_dev_open(void); -extern int ps3av_dev_close(void); #endif /* _ASM_POWERPC_PS3AV_H_ */ -- cgit v1.2.3 From 3d7a198cfdb47405cfb4a3ea523876569fe341e6 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 13 Sep 2022 15:50:29 +0800 Subject: KVM: PPC: remove orphan declarations from kvm_ppc.h Remove the following orphan declarations from kvm_ppc.h: 1. kvmppc_mmu_priv_switch() has been removed since commit dd9ebf1f9435 ("KVM: PPC: e500: Add shadow PID support"). 2. kvmppc_core_destroy_mmu() has been removed since commit ecc0981ff07c ("KVM: ppc: cosmetic changes to mmu hook names"). 3. kvmppc_prepare_vrma() has been removed since commit aa04b4cc5be6 ("KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests"). So remove the declarations for them from header file. Signed-off-by: Gaosheng Cui Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220913075029.682327-10-cuigaosheng1@huawei.com --- arch/powerpc/include/asm/kvm_ppc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9f625af3b65b..bfacf12784dd 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -104,7 +104,6 @@ extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, unsigned int gtlb_idx); -extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); @@ -153,7 +152,6 @@ extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu); extern int kvmppc_booke_init(void); extern void kvmppc_booke_exit(void); -extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); @@ -162,8 +160,6 @@ extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern void kvmppc_rmap_reset(struct kvm *kvm); -extern long kvmppc_prepare_vrma(struct kvm *kvm, - struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 0c32903197ce9f7119aee75a6bcaa4b49e0cd21a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 17 Sep 2022 16:36:47 +1000 Subject: powerpc/64: Remove unused prom_init_toc symbols Commit 24d33ac5b8ff ("powerpc/64s: Make prom_init require RELOCATABLE") made prom_init depend on CONFIG_RELOCATABLE. But it missed cleaning up a case in the linker script for RELOCATABLE=n, and associated symbols. Remove them now. Fixes: 24d33ac5b8ff ("powerpc/64s: Make prom_init require RELOCATABLE") Reported-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220920131157.1032707-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/sections.h | 3 --- arch/powerpc/kernel/prom_init_check.sh | 3 +-- arch/powerpc/kernel/vmlinux.lds.S | 5 ----- 3 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 183e6b8af392..babda2677b30 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -32,9 +32,6 @@ extern long kvm_flush_link_stack; extern char __start_interrupts[]; extern char __end_interrupts[]; -extern char __prom_init_toc_start[]; -extern char __prom_init_toc_end[]; - #ifdef CONFIG_PPC_POWERNV extern char start_real_trampolines[]; extern char end_real_trampolines[]; diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index dfa5f729f774..311890d71c4c 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -26,8 +26,7 @@ _end enter_prom $MEM_FUNCS reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start logo_linux_clut224 btext_prepare_BAT reloc_got2 kernstart_addr memstart_addr linux_banner _stext -__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC. -relocate" +btext_setup_display TOC. relocate" NM="$1" OBJ="$2" diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index fe22d940412f..0f2a10b029e8 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -337,11 +337,6 @@ SECTIONS .got : AT(ADDR(.got) - LOAD_OFFSET) ALIGN(256) { *(.got) -#ifndef CONFIG_RELOCATABLE - __prom_init_toc_start = .; - arch/powerpc/kernel/prom_init.o*(.toc) - __prom_init_toc_end = .; -#endif *(.toc) } #endif -- cgit v1.2.3 From b150a4d12b919baf956b807aa305cf78df03d0fe Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 16 Sep 2022 14:41:24 +1000 Subject: powerpc/vmlinux.lds: Add an explicit symbol for the SRWX boundary Currently __init_begin is used as the boundary for strict RWX between executable/read-only text and data, and non-executable (after boot) code and data. But that's a little subtle, so add an explicit symbol to document that the SRWX boundary lies there, and add a comment making it clear that __init_begin must also begin there. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220916131422.318752-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/sections.h | 1 + arch/powerpc/kernel/vmlinux.lds.S | 9 +++++++-- arch/powerpc/mm/book3s32/mmu.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 4 ++-- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index babda2677b30..9c00c9c0ca8f 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -13,6 +13,7 @@ typedef struct func_desc func_desc_t; #include extern char __head_end[]; +extern char __srwx_boundary[]; /* Patch sites */ extern s32 patch__call_flush_branch_caches1; diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index dacf8b4302d9..29d891329856 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -214,11 +214,16 @@ SECTIONS } #endif + /* + * Various code relies on __init_begin being at the strict RWX boundary. + */ + . = ALIGN(STRICT_ALIGN_SIZE); + __srwx_boundary = .; + __init_begin = .; + /* * Init sections discarded at runtime */ - . = ALIGN(STRICT_ALIGN_SIZE); - __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; INIT_TEXT diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index a96b73006dfb..250d174459d4 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -158,7 +158,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long done; - unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + unsigned long border = (unsigned long)__srwx_boundary - PAGE_OFFSET; unsigned long size; size = roundup_pow_of_two((unsigned long)_einittext - PAGE_OFFSET); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 698274109c91..9f880f91f584 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -259,8 +259,8 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX - if (addr < __pa_symbol(__init_begin)) - return __pa_symbol(__init_begin); + if (addr < __pa_symbol(__srwx_boundary)) + return __pa_symbol(__srwx_boundary); #endif return end; } -- cgit v1.2.3 From 51da853e3708852f47cd95e6f5e1821c3d54c3ef Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 3 Sep 2022 22:36:39 +1000 Subject: powerpc/mm/64s: Drop pgd_huge() On powerpc there are two ways for huge pages to be represented in the top level page table, aka PGD (Page Global Directory). If the address space mapped by an individual PGD entry does not correspond to a given huge page size, then the PGD entry points to a non-standard page table, known as a "hugepd" (Huge Page Directory). The hugepd contains some number of huge page PTEs sufficient to map the address space with the given huge page size. On the other hand, if the address space mapped by an individual PGD entry does correspond exactly to a given huge page size, that PGD entry is used to directly encode the huge page PTE in place. In this case the pgd_huge() wrapper indicates to generic code that the PGD entry is actually a huge page PTE. This commit deals with the pgd_huge() case only, it does nothing with respect to the hugepd case. Over time the size of the virtual address space supported on powerpc has increased several times, which means the location at which huge pages can sit in the tree has also changed. There have also been new huge page sizes added, with the introduction of the Radix MMU. On Power9 and later with the Radix MMU, the largest huge page size in any implementation is 1GB. Since the introduction of Radix, 1GB entries have been supported at the PUD level, with both 4K and 64K base page size. Radix has never had a supported huge page size at the PGD level. On Power8 or earlier, which uses the Hash MMU, or Power9 or later with the Hash MMU enabled, the largest huge page size is 16GB. Using the Hash MMU and a base page size of 4K, 16GB has never been a supported huge page size at the PGD level, due to the geometry being incompatible. The two supported huge page sizes (16M & 16GB) both use the hugepd format. Using the Hash MMU and a base page size of 64K, 16GB pages were supported in the past at the PGD level. However in commit ba95b5d03596 ("powerpc/mm/book3s/64: Rework page table geometry for lower memory usage") the page table layout was reworked to shrink the size of the PGD. As a result the 16GB page size now fits at the PUD level when using 64K base page size. Therefore there are no longer any supported configurations where pgd_huge() can be true, so drop the definitions for pgd_huge(), and fallback to the generic definition which is always false. Fixes: ba95b5d03596 ("powerpc/mm/book3s/64: Rework page table geometry for lower memory usage") Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220903123640.719846-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/book3s/64/pgtable-4k.h | 10 ---------- arch/powerpc/include/asm/book3s/64/pgtable-64k.h | 9 --------- 2 files changed, 19 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h index 4e697bc2f4cd..48f21820afe2 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -26,16 +26,6 @@ static inline int pud_huge(pud_t pud) return 0; } -static inline int pgd_huge(pgd_t pgd) -{ - /* - * leaf pte for huge page - */ - if (radix_enabled()) - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); - return 0; -} -#define pgd_huge pgd_huge /* * With radix , we have hugepage ptes in the pud and pmd entries. We don't * need to setup hugepage directory for them. Our pte and page directory format diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 34d1018896b3..2fce3498b000 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -30,15 +30,6 @@ static inline int pud_huge(pud_t pud) return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } -static inline int pgd_huge(pgd_t pgd) -{ - /* - * leaf pte for huge page - */ - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); -} -#define pgd_huge pgd_huge - /* * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't * need to setup hugepage directory for them. Our pte and page directory format -- cgit v1.2.3 From 79c5640ab4460a03535ce0f120193174e7701b65 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 3 Sep 2022 22:36:40 +1000 Subject: powerpc/mm/64s: Drop p4d_leaf() Because 64-bit Book3S uses pgtable-nop4d.h, the P4D is folded into the PGD. So P4D entries are actually PGD entries, or vice versa. The other way to think of it is that the P4D is a single entry page table below the PGD. Zero bits of the address are needed to index into the P4D, therefore a P4D entry maps the same size address space as a PGD entry. As explained in the previous commit, there are no huge page sizes supported directly at the PGD level on 64-bit Book3S, so there are also no huge page sizes supported at the P4D level. Therefore p4d_is_leaf() can never be true, so drop the definition and fallback to the default implementation that always returns false. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220903123640.719846-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/book3s/64/pgtable.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b7fd9b69e828..28fd016cf7ff 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1462,12 +1462,5 @@ static inline bool pud_is_leaf(pud_t pud) return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } -#define p4d_is_leaf p4d_is_leaf -#define p4d_leaf p4d_is_leaf -static inline bool p4d_is_leaf(p4d_t p4d) -{ - return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE)); -} - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ -- cgit v1.2.3 From a26494cf4aeb8e9888428a43f55cc486f06f1334 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 11:34:44 +0200 Subject: powerpc/nohash: Remove pgd_huge() stub linux/hugetlb.h has a fallback pgd_huge() macro for when pgd_huge is not defined. Remove the powerpc redundant definitions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ae6aa7fce84f7abcbf67f534271a4a6dd7949b0d.1662543243.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/pgtable.h | 6 ------ arch/powerpc/include/asm/page.h | 1 - 2 files changed, 7 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index b499da6c1a99..08429c612cdf 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -277,12 +277,6 @@ static inline int pud_huge(pud_t pud) return 0; } -static inline int pgd_huge(pgd_t pgd) -{ - return 0; -} -#define pgd_huge pgd_huge - #define is_hugepd(hpd) (hugepd_ok(hpd)) #endif diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index e5f75c70eda8..c67eb9531a3f 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -311,7 +311,6 @@ static inline bool pfn_valid(unsigned long pfn) #ifndef CONFIG_HUGETLB_PAGE #define is_hugepd(pdep) (0) -#define pgd_huge(pgd) (0) #endif /* CONFIG_HUGETLB_PAGE */ struct page; -- cgit v1.2.3 From 691cdf016d3be6f66a3ea384809be229e0f9c590 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 11:34:45 +0200 Subject: powerpc: Rely on generic definition of hugepd_t and is_hugepd when unused CONFIG_ARCH_HAS_HUGEPD is used to tell core mm when huge page directories are used. When they are not used, no need to provide hugepd_t or is_hugepd(), just rely on the core mm fallback definition. For that, change core mm behaviour so that CONFIG_ARCH_HAS_HUGEPD is used instead of indirect is_hugepd macro existence. powerpc being the only user of huge page directories, there is no impact on other architectures. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/da81462d93069bb90fe5e762dd3283a644318937.1662543243.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/page.h | 5 ----- arch/powerpc/include/asm/pgtable-be-types.h | 2 ++ arch/powerpc/include/asm/pgtable-types.h | 2 ++ include/linux/hugetlb.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index c67eb9531a3f..7f20636d13ed 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -308,11 +308,6 @@ static inline bool pfn_valid(unsigned long pfn) #include #endif - -#ifndef CONFIG_HUGETLB_PAGE -#define is_hugepd(pdep) (0) -#endif /* CONFIG_HUGETLB_PAGE */ - struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); extern void copy_user_page(void *to, void *from, unsigned long vaddr, diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index b169bbf95fcb..82633200b500 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -101,6 +101,7 @@ static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new) return pmd_raw(old) == prev; } +#ifdef CONFIG_ARCH_HAS_HUGEPD typedef struct { __be64 pdbe; } hugepd_t; #define __hugepd(x) ((hugepd_t) { cpu_to_be64(x) }) @@ -108,5 +109,6 @@ static inline unsigned long hpd_val(hugepd_t x) { return be64_to_cpu(x.pdbe); } +#endif #endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */ diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index efed0db7b1db..082c85cc09b1 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -83,11 +83,13 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) } #endif +#ifdef CONFIG_ARCH_HAS_HUGEPD typedef struct { unsigned long pd; } hugepd_t; #define __hugepd(x) ((hugepd_t) { (x) }) static inline unsigned long hpd_val(hugepd_t x) { return x.pd; } +#endif #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ec981a0d8b3..1ec1535be04f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -17,7 +17,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; -#ifndef is_hugepd +#ifndef CONFIG_ARCH_HAS_HUGEPD typedef struct { unsigned long pd; } hugepd_t; #define is_hugepd(hugepd) (0) #define __hugepd(x) ((hugepd_t) { (x) }) -- cgit v1.2.3 From 73ea68ad0d2f655815b6f1fbe1c5521d72f01b64 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 5 Sep 2022 11:38:25 +0200 Subject: powerpc/book3s: Inline first level of update_mmu_cache() update_mmu_cache() voids when hash page tables are not used. On PPC32 that means when MMU_FTR_HPTE_TABLE is not defined. On PPC64 that means when RADIX is enabled. Rename core part of update_mmu_cache() as __update_mmu_cache() and include the initial verification in an inlined caller. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/bea5ad0de7f83eff256116816d46c84fa0a444de.1662370698.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/pgtable.h | 15 ++++++++++----- arch/powerpc/mm/book3s32/mmu.c | 4 +--- arch/powerpc/mm/book3s64/hash_utils.c | 5 +---- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index e8269434ecbe..d18b748ea3ae 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -25,7 +25,8 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); #define __HAVE_PHYS_MEM_ACCESS_PROT -#if defined(CONFIG_PPC32) || defined(CONFIG_PPC_64S_HASH_MMU) +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); + /* * This gets called at the end of handling a page fault, when * the kernel has put a new PTE into the page table for the process. @@ -35,10 +36,14 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, * corresponding HPTE into the hash table ahead of time, instead of * waiting for the inevitable extra hash-table miss exception. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); -#else -static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {} -#endif +static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +{ + if (IS_ENABLED(CONFIG_PPC32) && !mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + if (radix_enabled()) + return; + __update_mmu_cache(vma, address, ptep); +} #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index e5ee05c234c7..850783cfa9c7 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -314,11 +314,9 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return; /* * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 363a9447d63d..ced1107b1677 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1781,7 +1781,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* @@ -1791,9 +1791,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, unsigned long trap; bool is_exec; - if (radix_enabled()) - return; - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) return; -- cgit v1.2.3 From b997b2f57cae396448bb62c428efa4b112dd90ed Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 12:05:01 +0200 Subject: powerpc/mm: Reduce redundancy in pgtable.h PAGE_KERNEL_TEXT, PAGE_KERNEL_EXEC and PAGE_AGP are the same for all powerpcs. Remove duplicated definitions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/92254499430d13d99e4a4d7e9ad8e8284cb35380.1662544974.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 19 ------------------- arch/powerpc/include/asm/book3s/64/pgtable.h | 19 ------------------- arch/powerpc/include/asm/nohash/pgtable.h | 19 ------------------- arch/powerpc/include/asm/pgtable.h | 19 +++++++++++++++++++ 4 files changed, 19 insertions(+), 57 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 40041ac713d9..8da6d4544822 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -118,25 +118,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -/* - * Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X - -/* Advertise special mapping type for AGP */ -#define PAGE_AGP (PAGE_KERNEL_NC) -#define HAVE_PAGE_AGP - #define PTE_INDEX_SIZE PTE_SHIFT #define PMD_INDEX_SIZE 0 #define PUD_INDEX_SIZE 0 diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 28fd016cf7ff..3007a251a186 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -164,22 +164,6 @@ #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -/* - * Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X -#define PAGE_AGP (PAGE_KERNEL_NC) - #ifndef __ASSEMBLY__ /* * page table defines @@ -335,9 +319,6 @@ extern unsigned long pci_io_base; #define IOREMAP_END (KERN_IO_END - FIXADDR_SIZE) #define FIXADDR_SIZE SZ_32M -/* Advertise special mapping type for AGP */ -#define HAVE_PAGE_AGP - #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 08429c612cdf..18b29cfee0d6 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -17,25 +17,6 @@ #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -/* - * Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X - -/* Advertise special mapping type for AGP */ -#define PAGE_AGP (PAGE_KERNEL_NC) -#define HAVE_PAGE_AGP - #ifndef __ASSEMBLY__ /* Generic accessors to PTE bits */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 33f4bf8d22b0..283f40d05a4d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -20,6 +20,25 @@ struct mm_struct; #include #endif /* !CONFIG_PPC_BOOK3S */ +/* + * Protection used for kernel text. We want the debuggers to be able to + * set breakpoints anywhere, so don't write protect the kernel text + * on platforms where such control is possible. + */ +#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) +#define PAGE_KERNEL_TEXT PAGE_KERNEL_X +#else +#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX +#endif + +/* Make modules code happy. We don't set RO yet */ +#define PAGE_KERNEL_EXEC PAGE_KERNEL_X + +/* Advertise special mapping type for AGP */ +#define PAGE_AGP (PAGE_KERNEL_NC) +#define HAVE_PAGE_AGP + #ifndef __ASSEMBLY__ #ifndef MAX_PTRS_PER_PGD -- cgit v1.2.3 From 6cc07821adce44e864c3752a3842936a6a7f6aef Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 12:05:21 +0200 Subject: powerpc/mm: Make PAGE_KERNEL_xxx macros grep-friendly Avoid multi-lines to help getting a complete view when using grep. They still remain under the 100 chars limit. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3bc3f5a51949ee7f52dba36677db23d4337c7995.1662544980.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 3 +-- arch/powerpc/include/asm/book3s/64/pgtable.h | 9 +++------ arch/powerpc/include/asm/nohash/pgtable.h | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 8da6d4544822..75823f39e042 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -112,8 +112,7 @@ static inline bool pte_user(pte_t pte) /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE | _PAGE_GUARDED) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE | _PAGE_GUARDED) #define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 3007a251a186..b5f8264cc980 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -117,8 +117,7 @@ #define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) #define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) #define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_READ | _PAGE_EXEC) -#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ - _PAGE_RW | _PAGE_EXEC) +#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) /* * _PAGE_CHG_MASK masks of bits that are to be preserved across * pgprot changes @@ -156,10 +155,8 @@ /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) -#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_TOLERANT) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NON_IDEMPOTENT) +#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_TOLERANT) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NON_IDEMPOTENT) #define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 18b29cfee0d6..4fd73c7412d0 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -11,8 +11,7 @@ /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE | _PAGE_GUARDED) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE | _PAGE_GUARDED) #define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -- cgit v1.2.3 From 76b719881a26fec3b77652134f19cf1dfcc96318 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Sep 2022 19:01:29 +0200 Subject: powerpc/cputable: Move __cpu_setup() prototypes out of cputable.h Move all prototypes out of cputable.h For that rename cpu_setup_power.h to cpu_setup.h and move all prototypes in it. Signed-off-by: Christophe Leroy [mpe: Standardise cpu_spec *spec formatting] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f45118489ee450db654db8bbcdfd8f5907337c22.1663606875.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/cpu_setup.h | 49 ++++++++++++++++++++++++++++++ arch/powerpc/include/asm/cpu_setup_power.h | 12 -------- arch/powerpc/kernel/cpu_setup_power.c | 2 +- arch/powerpc/kernel/cputable.c | 38 +---------------------- 4 files changed, 51 insertions(+), 50 deletions(-) create mode 100644 arch/powerpc/include/asm/cpu_setup.h delete mode 100644 arch/powerpc/include/asm/cpu_setup_power.h (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/cpu_setup.h b/arch/powerpc/include/asm/cpu_setup.h new file mode 100644 index 000000000000..30e2fe389502 --- /dev/null +++ b/arch/powerpc/include/asm/cpu_setup.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 IBM Corporation + */ + +#ifndef _ASM_POWERPC_CPU_SETUP_H +#define _ASM_POWERPC_CPU_SETUP_H +void __setup_cpu_power7(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_power8(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_power9(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_power10(unsigned long offset, struct cpu_spec *spec); +void __restore_cpu_power7(void); +void __restore_cpu_power8(void); +void __restore_cpu_power9(void); +void __restore_cpu_power10(void); + +void __setup_cpu_e500v1(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_e500v2(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_e500mc(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_440ep(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_440epx(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_440gx(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_440grx(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_440spe(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_440x5(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_460ex(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_460gt(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_460sx(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_apm821xx(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_603(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_604(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_750(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_750cx(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_750fx(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_7400(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_7410(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_745x(unsigned long offset, struct cpu_spec *spec); + +void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec *spec); +void __restore_cpu_pa6t(void); +void __restore_cpu_ppc970(void); + +void __setup_cpu_e5500(unsigned long offset, struct cpu_spec *spec); +void __setup_cpu_e6500(unsigned long offset, struct cpu_spec *spec); +void __restore_cpu_e5500(void); +void __restore_cpu_e6500(void); +#endif /* _ASM_POWERPC_CPU_SETUP_H */ diff --git a/arch/powerpc/include/asm/cpu_setup_power.h b/arch/powerpc/include/asm/cpu_setup_power.h deleted file mode 100644 index 24be9131f803..000000000000 --- a/arch/powerpc/include/asm/cpu_setup_power.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2020 IBM Corporation - */ -void __setup_cpu_power7(unsigned long offset, struct cpu_spec *spec); -void __restore_cpu_power7(void); -void __setup_cpu_power8(unsigned long offset, struct cpu_spec *spec); -void __restore_cpu_power8(void); -void __setup_cpu_power9(unsigned long offset, struct cpu_spec *spec); -void __restore_cpu_power9(void); -void __setup_cpu_power10(unsigned long offset, struct cpu_spec *spec); -void __restore_cpu_power10(void); diff --git a/arch/powerpc/kernel/cpu_setup_power.c b/arch/powerpc/kernel/cpu_setup_power.c index 3dc61e203f37..097c033668f0 100644 --- a/arch/powerpc/kernel/cpu_setup_power.c +++ b/arch/powerpc/kernel/cpu_setup_power.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include /* Disable CPU_FTR_HVMODE and return false if MSR:HV is not set */ static bool init_hvmode_206(struct cpu_spec *t) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 5ace97cccad8..9229e0930332 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -18,6 +18,7 @@ #include #include #include +#include static struct cpu_spec the_cpu_spec __read_mostly; @@ -34,43 +35,6 @@ const char *powerpc_base_platform; * part of the cputable though. That has to be fixed for both ppc32 * and ppc64 */ -#ifdef CONFIG_PPC32 -extern void __setup_cpu_e500v1(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_e500v2(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_e500mc(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440ep(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440epx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440gx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440grx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440spe(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440x5(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_460ex(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_460gt(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_460sx(unsigned long offset, struct cpu_spec *spec); -extern void __setup_cpu_apm821xx(unsigned long offset, struct cpu_spec *spec); -extern void __setup_cpu_603(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_604(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_750(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_750cx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_750fx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_7400(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_7410(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec); -#endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 -#include -extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec); -extern void __restore_cpu_pa6t(void); -extern void __restore_cpu_ppc970(void); -#endif /* CONFIG_PPC64 */ -#if defined(CONFIG_E500) -extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_e6500(unsigned long offset, struct cpu_spec* spec); -extern void __restore_cpu_e5500(void); -extern void __restore_cpu_e6500(void); -#endif /* CONFIG_E500 */ /* This table only contains "desktop" CPUs, it need to be filled with embedded * ones as well... -- cgit v1.2.3 From dfc3095cec27f402c183da920f4733785e4c873d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Sep 2022 19:01:31 +0200 Subject: powerpc: Remove CONFIG_FSL_BOOKE PPC_85xx is PPC32 only. PPC_85xx always selects E500 and is the only PPC32 that selects E500. FSL_BOOKE is selected when E500 and PPC32 are selected. So FSL_BOOKE is redundant with PPC_85xx. Remove FSL_BOOKE. And rename four files accordingly. cpu_setup_fsl_booke.S is not renamed because it is linked to PPC_FSL_BOOK3E and not to FSL_BOOKE as suggested by its name. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/08e3e15594e66d63b9e89c5b4f9c35153913c28f.1663606875.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 28 +- arch/powerpc/Makefile | 2 +- arch/powerpc/include/asm/kexec.h | 2 +- arch/powerpc/include/asm/nohash/32/pgtable.h | 6 +- arch/powerpc/include/asm/nohash/32/pte-85xx.h | 74 ++ arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h | 74 -- arch/powerpc/include/asm/nohash/tlbflush.h | 2 +- arch/powerpc/kernel/85xx_entry_mapping.S | 230 ++++ arch/powerpc/kernel/Makefile | 6 +- arch/powerpc/kernel/fsl_booke_entry_mapping.S | 230 ---- arch/powerpc/kernel/head_85xx.S | 1227 ++++++++++++++++++++ arch/powerpc/kernel/head_fsl_booke.S | 1227 -------------------- arch/powerpc/kernel/kgdb.c | 12 +- arch/powerpc/kernel/swsusp_85xx.S | 202 ++++ arch/powerpc/kernel/swsusp_booke.S | 202 ---- arch/powerpc/kernel/traps.c | 4 +- arch/powerpc/kexec/core_32.c | 2 +- arch/powerpc/kexec/relocate_32.S | 4 +- arch/powerpc/kvm/booke_interrupts.S | 4 +- arch/powerpc/mm/init_32.c | 4 +- arch/powerpc/mm/mmu_decl.h | 4 +- arch/powerpc/mm/nohash/fsl_book3e.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 2 +- arch/powerpc/mm/nohash/tlb_low.S | 2 +- arch/powerpc/platforms/Kconfig.cputype | 11 +- 25 files changed, 1779 insertions(+), 1784 deletions(-) create mode 100644 arch/powerpc/include/asm/nohash/32/pte-85xx.h delete mode 100644 arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h create mode 100644 arch/powerpc/kernel/85xx_entry_mapping.S delete mode 100644 arch/powerpc/kernel/fsl_booke_entry_mapping.S create mode 100644 arch/powerpc/kernel/head_85xx.S delete mode 100644 arch/powerpc/kernel/head_fsl_booke.S create mode 100644 arch/powerpc/kernel/swsusp_85xx.S delete mode 100644 arch/powerpc/kernel/swsusp_booke.S (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 220045692e48..dafb14f44672 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,7 +135,7 @@ config PPC select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if (PPC_BOOK3S || PPC_8xx || 40x) && !HIBERNATION - select ARCH_HAS_STRICT_KERNEL_RWX if FSL_BOOKE && !HIBERNATION && !RANDOMIZE_BASE + select ARCH_HAS_STRICT_KERNEL_RWX if PPC_85xx && !HIBERNATION && !RANDOMIZE_BASE select ARCH_HAS_STRICT_MODULE_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE @@ -548,7 +548,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config KEXEC bool "kexec system call" - depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) || PPC_BOOK3E + depends on (PPC_BOOK3S || PPC_85xx || (44x && !SMP)) || PPC_BOOK3E select KEXEC_CORE help kexec is a system call that implements the ability to shutdown your @@ -583,7 +583,7 @@ config ARCH_HAS_KEXEC_PURGATORY config RELOCATABLE bool "Build a relocatable kernel" - depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE)) + depends on PPC64 || (FLATMEM && (44x || PPC_85xx)) select NONSTATIC_KERNEL help This builds a kernel image that is capable of running at the @@ -606,7 +606,7 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image" - depends on (FSL_BOOKE && FLATMEM && PPC32) + depends on (PPC_85xx && FLATMEM && PPC32) depends on RELOCATABLE help Randomizes the virtual address at which the kernel image is @@ -625,8 +625,8 @@ config RELOCATABLE_TEST config CRASH_DUMP bool "Build a dump capture kernel" - depends on PPC64 || PPC_BOOK3S_32 || FSL_BOOKE || (44x && !SMP) - select RELOCATABLE if PPC64 || 44x || FSL_BOOKE + depends on PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP) + select RELOCATABLE if PPC64 || 44x || PPC_85xx help Build a kernel suitable for use as a dump capture kernel. The same kernel binary can be used as production kernel and dump @@ -815,7 +815,7 @@ config DATA_SHIFT_BOOL depends on ADVANCED_OPTIONS depends on STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && !STRICT_KERNEL_RWX) || \ - FSL_BOOKE + PPC_85xx help This option allows you to set the kernel data alignment. When RAM is mapped by blocks, the alignment needs to fit the size and @@ -828,13 +828,13 @@ config DATA_SHIFT default 24 if STRICT_KERNEL_RWX && PPC64 range 17 28 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32 range 19 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_8xx - range 20 24 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && FSL_BOOKE + range 20 24 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_85xx default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default 18 if (DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32 default 23 if STRICT_KERNEL_RWX && PPC_8xx default 23 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx && PIN_TLB_DATA default 19 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx - default 24 if STRICT_KERNEL_RWX && FSL_BOOKE + default 24 if STRICT_KERNEL_RWX && PPC_85xx default PPC_PAGE_SHIFT help On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO. @@ -1150,7 +1150,7 @@ config LOWMEM_SIZE config LOWMEM_CAM_NUM_BOOL bool "Set number of CAMs to use to map low memory" - depends on ADVANCED_OPTIONS && FSL_BOOKE + depends on ADVANCED_OPTIONS && PPC_85xx help This option allows you to set the maximum number of CAM slots that will be used to map low memory. There are a limited number of slots @@ -1161,7 +1161,7 @@ config LOWMEM_CAM_NUM_BOOL Say N here unless you know what you are doing. config LOWMEM_CAM_NUM - depends on FSL_BOOKE + depends on PPC_85xx int "Number of CAMs to use to map low memory" if LOWMEM_CAM_NUM_BOOL default 3 if !STRICT_KERNEL_RWX default 9 if DATA_SHIFT >= 24 @@ -1170,7 +1170,7 @@ config LOWMEM_CAM_NUM config DYNAMIC_MEMSTART bool "Enable page aligned dynamic load address for kernel" - depends on ADVANCED_OPTIONS && FLATMEM && (FSL_BOOKE || 44x) + depends on ADVANCED_OPTIONS && FLATMEM && (PPC_85xx || 44x) select NONSTATIC_KERNEL help This option enables the kernel to be loaded at any page aligned @@ -1219,7 +1219,7 @@ config KERNEL_START config PHYSICAL_START_BOOL bool "Set physical address where the kernel is loaded" - depends on ADVANCED_OPTIONS && FLATMEM && FSL_BOOKE + depends on ADVANCED_OPTIONS && FLATMEM && PPC_85xx help This gives the physical address where the kernel is loaded. @@ -1232,7 +1232,7 @@ config PHYSICAL_START config PHYSICAL_ALIGN hex - default "0x04000000" if FSL_BOOKE + default "0x04000000" if PPC_85xx help This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 02742facf895..f6d477c4aa64 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -231,7 +231,7 @@ head-$(CONFIG_PPC_BOOK3S_32) := arch/powerpc/kernel/head_book3s_32.o head-$(CONFIG_PPC_8xx) := arch/powerpc/kernel/head_8xx.o head-$(CONFIG_40x) := arch/powerpc/kernel/head_40x.o head-$(CONFIG_44x) := arch/powerpc/kernel/head_44x.o -head-$(CONFIG_FSL_BOOKE) := arch/powerpc/kernel/head_fsl_booke.o +head-$(CONFIG_PPC_85xx) := arch/powerpc/kernel/head_85xx.o head-$(CONFIG_PPC64) += arch/powerpc/kernel/entry_64.o head-$(CONFIG_PPC_FPU) += arch/powerpc/kernel/fpu.o diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index f8d122d16af4..a1ddba01e7d1 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -3,7 +3,7 @@ #define _ASM_POWERPC_KEXEC_H #ifdef __KERNEL__ -#if defined(CONFIG_FSL_BOOKE) || defined(CONFIG_44x) +#if defined(CONFIG_PPC_85xx) || defined(CONFIG_44x) /* * On FSL-BookE we setup a 1:1 mapping which covers the first 2GiB of memory diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 9091e4904a6b..197e7552d9f6 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -130,10 +130,10 @@ void unmap_kernel_page(unsigned long va); #include #elif defined(CONFIG_44x) #include -#elif defined(CONFIG_FSL_BOOKE) && defined(CONFIG_PTE_64BIT) +#elif defined(CONFIG_PPC_85xx) && defined(CONFIG_PTE_64BIT) #include -#elif defined(CONFIG_FSL_BOOKE) -#include +#elif defined(CONFIG_PPC_85xx) +#include #elif defined(CONFIG_PPC_8xx) #include #endif diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h new file mode 100644 index 000000000000..93fb8e11a3f1 --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_NOHASH_32_PTE_85xx_H +#define _ASM_POWERPC_NOHASH_32_PTE_85xx_H +#ifdef __KERNEL__ + +/* PTE bit definitions for Freescale BookE SW loaded TLB MMU based + * processors + * + MMU Assist Register 3: + + 32 33 34 35 36 ... 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + RPN...................... 0 0 U0 U1 U2 U3 UX SX UW SW UR SR + + - PRESENT *must* be in the bottom three bits because swap cache + entries use the top 29 bits. + +*/ + +/* Definitions for FSL Book-E Cores */ +#define _PAGE_PRESENT 0x00001 /* S: PTE contains a translation */ +#define _PAGE_USER 0x00002 /* S: User page (maps to UR) */ +#define _PAGE_RW 0x00004 /* S: Write permission (SW) */ +#define _PAGE_DIRTY 0x00008 /* S: Page dirty */ +#define _PAGE_EXEC 0x00010 /* H: SX permission */ +#define _PAGE_ACCESSED 0x00020 /* S: Page referenced */ + +#define _PAGE_ENDIAN 0x00040 /* H: E bit */ +#define _PAGE_GUARDED 0x00080 /* H: G bit */ +#define _PAGE_COHERENT 0x00100 /* H: M bit */ +#define _PAGE_NO_CACHE 0x00200 /* H: I bit */ +#define _PAGE_WRITETHRU 0x00400 /* H: W bit */ +#define _PAGE_SPECIAL 0x00800 /* S: Special page */ + +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX _PAGE_EXEC +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) + +/* No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 + +#define _PMD_PRESENT 0 +#define _PMD_PRESENT_MASK (PAGE_MASK) +#define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 + +#define _PTE_NONE_MASK 0 + +#define PTE_WIMGE_SHIFT (6) + +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) + +#endif /* __KERNEL__ */ +#endif /* _ASM_POWERPC_NOHASH_32_PTE_FSL_85xx_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h deleted file mode 100644 index 0fc1bd42bb3e..000000000000 --- a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H -#define _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H -#ifdef __KERNEL__ - -/* PTE bit definitions for Freescale BookE SW loaded TLB MMU based - * processors - * - MMU Assist Register 3: - - 32 33 34 35 36 ... 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - RPN...................... 0 0 U0 U1 U2 U3 UX SX UW SW UR SR - - - PRESENT *must* be in the bottom three bits because swap cache - entries use the top 29 bits. - -*/ - -/* Definitions for FSL Book-E Cores */ -#define _PAGE_PRESENT 0x00001 /* S: PTE contains a translation */ -#define _PAGE_USER 0x00002 /* S: User page (maps to UR) */ -#define _PAGE_RW 0x00004 /* S: Write permission (SW) */ -#define _PAGE_DIRTY 0x00008 /* S: Page dirty */ -#define _PAGE_EXEC 0x00010 /* H: SX permission */ -#define _PAGE_ACCESSED 0x00020 /* S: Page referenced */ - -#define _PAGE_ENDIAN 0x00040 /* H: E bit */ -#define _PAGE_GUARDED 0x00080 /* H: G bit */ -#define _PAGE_COHERENT 0x00100 /* H: M bit */ -#define _PAGE_NO_CACHE 0x00200 /* H: I bit */ -#define _PAGE_WRITETHRU 0x00400 /* H: W bit */ -#define _PAGE_SPECIAL 0x00800 /* S: Special page */ - -#define _PAGE_KERNEL_RO 0 -#define _PAGE_KERNEL_ROX _PAGE_EXEC -#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) - -/* No page size encoding in the linux PTE */ -#define _PAGE_PSIZE 0 - -#define _PMD_PRESENT 0 -#define _PMD_PRESENT_MASK (PAGE_MASK) -#define _PMD_BAD (~PAGE_MASK) -#define _PMD_USER 0 - -#define _PTE_NONE_MASK 0 - -#define PTE_WIMGE_SHIFT (6) - -/* - * We define 2 sets of base prot bits, one for basic pages (ie, - * cacheable kernel and user pages) and one for non cacheable - * pages. We always set _PAGE_COHERENT when SMP is enabled or - * the processor might need it for DMA coherency. - */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) -#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) -#else -#define _PAGE_BASE (_PAGE_BASE_NC) -#endif - -/* Permission masks used to generate the __P and __S table */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H */ diff --git a/arch/powerpc/include/asm/nohash/tlbflush.h b/arch/powerpc/include/asm/nohash/tlbflush.h index 698935d4f72d..bdaf34ad41ea 100644 --- a/arch/powerpc/include/asm/nohash/tlbflush.h +++ b/arch/powerpc/include/asm/nohash/tlbflush.h @@ -18,7 +18,7 @@ /* * TLB flushing for software loaded TLB chips * - * TODO: (CONFIG_FSL_BOOKE) determine if flush_tlb_range & + * TODO: (CONFIG_PPC_85xx) determine if flush_tlb_range & * flush_tlb_kernel_range are best implemented as tlbia vs * specific tlbie's */ diff --git a/arch/powerpc/kernel/85xx_entry_mapping.S b/arch/powerpc/kernel/85xx_entry_mapping.S new file mode 100644 index 000000000000..dedc17fac8f8 --- /dev/null +++ b/arch/powerpc/kernel/85xx_entry_mapping.S @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* 1. Find the index of the entry we're executing in */ + bcl 20,31,$+4 /* Find our address */ +invstr: mflr r6 /* Make it accessible */ + mfmsr r7 + rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ + mfspr r7, SPRN_PID0 + slwi r7,r7,16 + or r7,r7,r4 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* search MSR[IS], SPID=PID0 */ + mfspr r7,SPRN_MAS1 + andis. r7,r7,MAS1_VALID@h + bne match_TLB + + mfspr r7,SPRN_MMUCFG + rlwinm r7,r7,21,28,31 /* extract MMUCFG[NPIDS] */ + cmpwi r7,3 + bne match_TLB /* skip if NPIDS != 3 */ + + mfspr r7,SPRN_PID1 + slwi r7,r7,16 + or r7,r7,r4 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* search MSR[IS], SPID=PID1 */ + mfspr r7,SPRN_MAS1 + andis. r7,r7,MAS1_VALID@h + bne match_TLB + mfspr r7, SPRN_PID2 + slwi r7,r7,16 + or r7,r7,r4 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* Fall through, we had to match */ + +match_TLB: + mfspr r7,SPRN_MAS0 + rlwinm r3,r7,16,20,31 /* Extract MAS0(Entry) */ + + mfspr r7,SPRN_MAS1 /* Insure IPROT set */ + oris r7,r7,MAS1_IPROT@h + mtspr SPRN_MAS1,r7 + tlbwe + +/* 2. Invalidate all entries except the entry we're executing in */ + mfspr r9,SPRN_TLB1CFG + andi. r9,r9,0xfff + li r6,0 /* Set Entry counter to 0 */ +1: lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r6,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r7,SPRN_MAS1 + rlwinm r7,r7,0,2,31 /* Clear MAS1 Valid and IPROT */ + cmpw r3,r6 + beq skpinv /* Dont update the current execution TLB */ + mtspr SPRN_MAS1,r7 + tlbwe + isync +skpinv: addi r6,r6,1 /* Increment */ + cmpw r6,r9 /* Are we done? */ + bne 1b /* If not, repeat */ + + /* Invalidate TLB0 */ + li r6,0x04 + tlbivax 0,r6 + TLBSYNC + /* Invalidate TLB1 */ + li r6,0x0c + tlbivax 0,r6 + TLBSYNC + +/* 3. Setup a temp mapping and jump to it */ + andi. r5, r3, 0x1 /* Find an entry not used and is non-zero */ + addi r5, r5, 0x1 + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r7 + tlbre + + /* grab and fixup the RPN */ + mfspr r6,SPRN_MAS1 /* extract MAS1[SIZE] */ + rlwinm r6,r6,25,27,31 + li r8,-1 + addi r6,r6,10 + slw r6,r8,r6 /* convert to mask */ + + bcl 20,31,$+4 /* Find our address */ +1: mflr r7 + + mfspr r8,SPRN_MAS3 +#ifdef CONFIG_PHYS_64BIT + mfspr r23,SPRN_MAS7 +#endif + and r8,r6,r8 + subfic r9,r6,-4096 + and r9,r9,r7 + + or r25,r8,r9 + ori r8,r25,(MAS3_SX|MAS3_SW|MAS3_SR) + + /* Just modify the entry ID and EPN for the temp mapping */ + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ + mtspr SPRN_MAS0,r7 + xori r6,r4,1 /* Setup TMP mapping in the other Address space */ + slwi r6,r6,12 + oris r6,r6,(MAS1_VALID|MAS1_IPROT)@h + ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_4K))@l + mtspr SPRN_MAS1,r6 + mfspr r6,SPRN_MAS2 + li r7,0 /* temp EPN = 0 */ + rlwimi r7,r6,0,20,31 + mtspr SPRN_MAS2,r7 + mtspr SPRN_MAS3,r8 + tlbwe + + xori r6,r4,1 + slwi r6,r6,5 /* setup new context with other address space */ + bcl 20,31,$+4 /* Find our address */ +1: mflr r9 + rlwimi r7,r9,0,20,31 + addi r7,r7,(2f - 1b) + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r6 + rfi +2: +/* 4. Clear out PIDs & Search info */ + li r6,0 + mtspr SPRN_MAS6,r6 + mtspr SPRN_PID0,r6 + + mfspr r7,SPRN_MMUCFG + rlwinm r7,r7,21,28,31 /* extract MMUCFG[NPIDS] */ + cmpwi r7,3 + bne 2f /* skip if NPIDS != 3 */ + + mtspr SPRN_PID1,r6 + mtspr SPRN_PID2,r6 + +/* 5. Invalidate mapping we started in */ +2: + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r6,SPRN_MAS1 + rlwinm r6,r6,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r6 + tlbwe + /* Invalidate TLB1 */ + li r9,0x0c + tlbivax 0,r9 + TLBSYNC + +#if defined(ENTRY_MAPPING_BOOT_SETUP) + +/* 6. Setup kernstart_virt_addr mapping in TLB1[0] */ + lis r6,0x1000 /* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */ + mtspr SPRN_MAS0,r6 + lis r6,(MAS1_VALID|MAS1_IPROT)@h + ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l + mtspr SPRN_MAS1,r6 + lis r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h + ori r6,r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l + and r6,r6,r20 + ori r6,r6,MAS2_M_IF_NEEDED@l + mtspr SPRN_MAS2,r6 + mtspr SPRN_MAS3,r8 + tlbwe + +/* 7. Jump to kernstart_virt_addr mapping */ + mr r6,r20 + +#elif defined(ENTRY_MAPPING_KEXEC_SETUP) +/* + * 6. Setup a 1:1 mapping in TLB1. Esel 0 is unsued, 1 or 2 contains the tmp + * mapping so we start at 3. We setup 8 mappings, each 256MiB in size. This + * will cover the first 2GiB of memory. + */ + + lis r10, (MAS1_VALID|MAS1_IPROT)@h + ori r10,r10, (MAS1_TSIZE(BOOK3E_PAGESZ_256M))@l + li r11, 0 + li r0, 8 + mtctr r0 + +next_tlb_setup: + addi r0, r11, 3 + rlwinm r0, r0, 16, 4, 15 // Compute esel + rlwinm r9, r11, 28, 0, 3 // Compute [ER]PN + oris r0, r0, (MAS0_TLBSEL(1))@h + mtspr SPRN_MAS0,r0 + mtspr SPRN_MAS1,r10 + mtspr SPRN_MAS2,r9 + ori r9, r9, (MAS3_SX|MAS3_SW|MAS3_SR) + mtspr SPRN_MAS3,r9 + tlbwe + addi r11, r11, 1 + bdnz+ next_tlb_setup + +/* 7. Jump to our 1:1 mapping */ + mr r6, r25 +#else + #error You need to specify the mapping or not use this at all. +#endif + + lis r7,MSR_KERNEL@h + ori r7,r7,MSR_KERNEL@l + bcl 20,31,$+4 /* Find our address */ +1: mflr r9 + rlwimi r6,r9,0,20,31 + addi r6,r6,(2f - 1b) + mtspr SPRN_SRR0,r6 + mtspr SPRN_SRR1,r7 + rfi /* start execution out of TLB1[0] entry */ + +/* 8. Clear out the temp mapping */ +2: lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r8,SPRN_MAS1 + rlwinm r8,r8,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r8 + tlbwe + /* Invalidate TLB1 */ + li r9,0x0c + tlbivax 0,r9 + TLBSYNC diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 06d2d1f78f71..4483cae7dc9f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -106,8 +106,8 @@ endif obj-$(CONFIG_PPC_BOOK3S_32) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o -ifdef CONFIG_FSL_BOOKE -obj-$(CONFIG_HIBERNATION) += swsusp_booke.o +ifdef CONFIG_PPC_85xx +obj-$(CONFIG_HIBERNATION) += swsusp_85xx.o else obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o endif @@ -122,7 +122,7 @@ extra-$(CONFIG_PPC64) := head_64.o extra-$(CONFIG_PPC_BOOK3S_32) := head_book3s_32.o extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o -extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o +extra-$(CONFIG_PPC_85xx) := head_85xx.o extra-$(CONFIG_PPC_8xx) := head_8xx.o extra-y += vmlinux.lds diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S deleted file mode 100644 index dedc17fac8f8..000000000000 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ /dev/null @@ -1,230 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -/* 1. Find the index of the entry we're executing in */ - bcl 20,31,$+4 /* Find our address */ -invstr: mflr r6 /* Make it accessible */ - mfmsr r7 - rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ - mfspr r7, SPRN_PID0 - slwi r7,r7,16 - or r7,r7,r4 - mtspr SPRN_MAS6,r7 - tlbsx 0,r6 /* search MSR[IS], SPID=PID0 */ - mfspr r7,SPRN_MAS1 - andis. r7,r7,MAS1_VALID@h - bne match_TLB - - mfspr r7,SPRN_MMUCFG - rlwinm r7,r7,21,28,31 /* extract MMUCFG[NPIDS] */ - cmpwi r7,3 - bne match_TLB /* skip if NPIDS != 3 */ - - mfspr r7,SPRN_PID1 - slwi r7,r7,16 - or r7,r7,r4 - mtspr SPRN_MAS6,r7 - tlbsx 0,r6 /* search MSR[IS], SPID=PID1 */ - mfspr r7,SPRN_MAS1 - andis. r7,r7,MAS1_VALID@h - bne match_TLB - mfspr r7, SPRN_PID2 - slwi r7,r7,16 - or r7,r7,r4 - mtspr SPRN_MAS6,r7 - tlbsx 0,r6 /* Fall through, we had to match */ - -match_TLB: - mfspr r7,SPRN_MAS0 - rlwinm r3,r7,16,20,31 /* Extract MAS0(Entry) */ - - mfspr r7,SPRN_MAS1 /* Insure IPROT set */ - oris r7,r7,MAS1_IPROT@h - mtspr SPRN_MAS1,r7 - tlbwe - -/* 2. Invalidate all entries except the entry we're executing in */ - mfspr r9,SPRN_TLB1CFG - andi. r9,r9,0xfff - li r6,0 /* Set Entry counter to 0 */ -1: lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r6,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ - mtspr SPRN_MAS0,r7 - tlbre - mfspr r7,SPRN_MAS1 - rlwinm r7,r7,0,2,31 /* Clear MAS1 Valid and IPROT */ - cmpw r3,r6 - beq skpinv /* Dont update the current execution TLB */ - mtspr SPRN_MAS1,r7 - tlbwe - isync -skpinv: addi r6,r6,1 /* Increment */ - cmpw r6,r9 /* Are we done? */ - bne 1b /* If not, repeat */ - - /* Invalidate TLB0 */ - li r6,0x04 - tlbivax 0,r6 - TLBSYNC - /* Invalidate TLB1 */ - li r6,0x0c - tlbivax 0,r6 - TLBSYNC - -/* 3. Setup a temp mapping and jump to it */ - andi. r5, r3, 0x1 /* Find an entry not used and is non-zero */ - addi r5, r5, 0x1 - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ - mtspr SPRN_MAS0,r7 - tlbre - - /* grab and fixup the RPN */ - mfspr r6,SPRN_MAS1 /* extract MAS1[SIZE] */ - rlwinm r6,r6,25,27,31 - li r8,-1 - addi r6,r6,10 - slw r6,r8,r6 /* convert to mask */ - - bcl 20,31,$+4 /* Find our address */ -1: mflr r7 - - mfspr r8,SPRN_MAS3 -#ifdef CONFIG_PHYS_64BIT - mfspr r23,SPRN_MAS7 -#endif - and r8,r6,r8 - subfic r9,r6,-4096 - and r9,r9,r7 - - or r25,r8,r9 - ori r8,r25,(MAS3_SX|MAS3_SW|MAS3_SR) - - /* Just modify the entry ID and EPN for the temp mapping */ - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ - mtspr SPRN_MAS0,r7 - xori r6,r4,1 /* Setup TMP mapping in the other Address space */ - slwi r6,r6,12 - oris r6,r6,(MAS1_VALID|MAS1_IPROT)@h - ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_4K))@l - mtspr SPRN_MAS1,r6 - mfspr r6,SPRN_MAS2 - li r7,0 /* temp EPN = 0 */ - rlwimi r7,r6,0,20,31 - mtspr SPRN_MAS2,r7 - mtspr SPRN_MAS3,r8 - tlbwe - - xori r6,r4,1 - slwi r6,r6,5 /* setup new context with other address space */ - bcl 20,31,$+4 /* Find our address */ -1: mflr r9 - rlwimi r7,r9,0,20,31 - addi r7,r7,(2f - 1b) - mtspr SPRN_SRR0,r7 - mtspr SPRN_SRR1,r6 - rfi -2: -/* 4. Clear out PIDs & Search info */ - li r6,0 - mtspr SPRN_MAS6,r6 - mtspr SPRN_PID0,r6 - - mfspr r7,SPRN_MMUCFG - rlwinm r7,r7,21,28,31 /* extract MMUCFG[NPIDS] */ - cmpwi r7,3 - bne 2f /* skip if NPIDS != 3 */ - - mtspr SPRN_PID1,r6 - mtspr SPRN_PID2,r6 - -/* 5. Invalidate mapping we started in */ -2: - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ - mtspr SPRN_MAS0,r7 - tlbre - mfspr r6,SPRN_MAS1 - rlwinm r6,r6,0,2,0 /* clear IPROT */ - mtspr SPRN_MAS1,r6 - tlbwe - /* Invalidate TLB1 */ - li r9,0x0c - tlbivax 0,r9 - TLBSYNC - -#if defined(ENTRY_MAPPING_BOOT_SETUP) - -/* 6. Setup kernstart_virt_addr mapping in TLB1[0] */ - lis r6,0x1000 /* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */ - mtspr SPRN_MAS0,r6 - lis r6,(MAS1_VALID|MAS1_IPROT)@h - ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l - mtspr SPRN_MAS1,r6 - lis r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h - ori r6,r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l - and r6,r6,r20 - ori r6,r6,MAS2_M_IF_NEEDED@l - mtspr SPRN_MAS2,r6 - mtspr SPRN_MAS3,r8 - tlbwe - -/* 7. Jump to kernstart_virt_addr mapping */ - mr r6,r20 - -#elif defined(ENTRY_MAPPING_KEXEC_SETUP) -/* - * 6. Setup a 1:1 mapping in TLB1. Esel 0 is unsued, 1 or 2 contains the tmp - * mapping so we start at 3. We setup 8 mappings, each 256MiB in size. This - * will cover the first 2GiB of memory. - */ - - lis r10, (MAS1_VALID|MAS1_IPROT)@h - ori r10,r10, (MAS1_TSIZE(BOOK3E_PAGESZ_256M))@l - li r11, 0 - li r0, 8 - mtctr r0 - -next_tlb_setup: - addi r0, r11, 3 - rlwinm r0, r0, 16, 4, 15 // Compute esel - rlwinm r9, r11, 28, 0, 3 // Compute [ER]PN - oris r0, r0, (MAS0_TLBSEL(1))@h - mtspr SPRN_MAS0,r0 - mtspr SPRN_MAS1,r10 - mtspr SPRN_MAS2,r9 - ori r9, r9, (MAS3_SX|MAS3_SW|MAS3_SR) - mtspr SPRN_MAS3,r9 - tlbwe - addi r11, r11, 1 - bdnz+ next_tlb_setup - -/* 7. Jump to our 1:1 mapping */ - mr r6, r25 -#else - #error You need to specify the mapping or not use this at all. -#endif - - lis r7,MSR_KERNEL@h - ori r7,r7,MSR_KERNEL@l - bcl 20,31,$+4 /* Find our address */ -1: mflr r9 - rlwimi r6,r9,0,20,31 - addi r6,r6,(2f - 1b) - mtspr SPRN_SRR0,r6 - mtspr SPRN_SRR1,r7 - rfi /* start execution out of TLB1[0] entry */ - -/* 8. Clear out the temp mapping */ -2: lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ - mtspr SPRN_MAS0,r7 - tlbre - mfspr r8,SPRN_MAS1 - rlwinm r8,r8,0,2,0 /* clear IPROT */ - mtspr SPRN_MAS1,r8 - tlbwe - /* Invalidate TLB1 */ - li r9,0x0c - tlbivax 0,r9 - TLBSYNC diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S new file mode 100644 index 000000000000..48b168b5dc57 --- /dev/null +++ b/arch/powerpc/kernel/head_85xx.S @@ -0,0 +1,1227 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Kernel execution entry point code. + * + * Copyright (c) 1995-1996 Gary Thomas + * Initial PowerPC version. + * Copyright (c) 1996 Cort Dougan + * Rewritten for PReP + * Copyright (c) 1996 Paul Mackerras + * Low-level exception handers, MMU support, and rewrite. + * Copyright (c) 1997 Dan Malek + * PowerPC 8xx modifications. + * Copyright (c) 1998-1999 TiVo, Inc. + * PowerPC 403GCX modifications. + * Copyright (c) 1999 Grant Erickson + * PowerPC 403GCX/405GP modifications. + * Copyright 2000 MontaVista Software Inc. + * PPC405 modifications + * PowerPC 403GCX/405GP modifications. + * Author: MontaVista Software, Inc. + * frank_rowand@mvista.com or source@mvista.com + * debbie_chu@mvista.com + * Copyright 2002-2004 MontaVista Software, Inc. + * PowerPC 44x support, Matt Porter + * Copyright 2004 Freescale Semiconductor, Inc + * PowerPC e500 modifications, Kumar Gala + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "head_booke.h" + +/* As with the other PowerPC ports, it is expected that when code + * execution begins here, the following registers contain valid, yet + * optional, information: + * + * r3 - Board info structure pointer (DRAM, frequency, MAC address, etc.) + * r4 - Starting address of the init RAM disk + * r5 - Ending address of the init RAM disk + * r6 - Start of kernel command line string (e.g. "mem=128") + * r7 - End of kernel command line string + * + */ + __HEAD +_GLOBAL(_stext); +_GLOBAL(_start); + /* + * Reserve a word at a fixed location to store the address + * of abatron_pteptrs + */ + nop + + /* Translate device tree address to physical, save in r30/r31 */ + bl get_phys_addr + mr r30,r3 + mr r31,r4 + + li r25,0 /* phys kernel start (low) */ + li r24,0 /* CPU number */ + li r23,0 /* phys kernel start (high) */ + +#ifdef CONFIG_RELOCATABLE + LOAD_REG_ADDR_PIC(r3, _stext) /* Get our current runtime base */ + + /* Translate _stext address to physical, save in r23/r25 */ + bl get_phys_addr + mr r23,r3 + mr r25,r4 + + bcl 20,31,$+4 +0: mflr r8 + addis r3,r8,(is_second_reloc - 0b)@ha + lwz r19,(is_second_reloc - 0b)@l(r3) + + /* Check if this is the second relocation. */ + cmpwi r19,1 + bne 1f + + /* + * For the second relocation, we already get the real memstart_addr + * from device tree. So we will map PAGE_OFFSET to memstart_addr, + * then the virtual address of start kernel should be: + * PAGE_OFFSET + (kernstart_addr - memstart_addr) + * Since the offset between kernstart_addr and memstart_addr should + * never be beyond 1G, so we can just use the lower 32bit of them + * for the calculation. + */ + lis r3,PAGE_OFFSET@h + + addis r4,r8,(kernstart_addr - 0b)@ha + addi r4,r4,(kernstart_addr - 0b)@l + lwz r5,4(r4) + + addis r6,r8,(memstart_addr - 0b)@ha + addi r6,r6,(memstart_addr - 0b)@l + lwz r7,4(r6) + + subf r5,r7,r5 + add r3,r3,r5 + b 2f + +1: + /* + * We have the runtime (virtual) address of our base. + * We calculate our shift of offset from a 64M page. + * We could map the 64M page we belong to at PAGE_OFFSET and + * get going from there. + */ + lis r4,KERNELBASE@h + ori r4,r4,KERNELBASE@l + rlwinm r6,r25,0,0x3ffffff /* r6 = PHYS_START % 64M */ + rlwinm r5,r4,0,0x3ffffff /* r5 = KERNELBASE % 64M */ + subf r3,r5,r6 /* r3 = r6 - r5 */ + add r3,r4,r3 /* Required Virtual Address */ + +2: bl relocate + + /* + * For the second relocation, we already set the right tlb entries + * for the kernel space, so skip the code in 85xx_entry_mapping.S + */ + cmpwi r19,1 + beq set_ivor +#endif + +/* We try to not make any assumptions about how the boot loader + * setup or used the TLBs. We invalidate all mappings from the + * boot loader and load a single entry in TLB1[0] to map the + * first 64M of kernel memory. Any boot info passed from the + * bootloader needs to live in this first 64M. + * + * Requirement on bootloader: + * - The page we're executing in needs to reside in TLB1 and + * have IPROT=1. If not an invalidate broadcast could + * evict the entry we're currently executing in. + * + * r3 = Index of TLB1 were executing in + * r4 = Current MSR[IS] + * r5 = Index of TLB1 temp mapping + * + * Later in mapin_ram we will correctly map lowmem, and resize TLB1[0] + * if needed + */ + +_GLOBAL(__early_start) + LOAD_REG_ADDR_PIC(r20, kernstart_virt_addr) + lwz r20,0(r20) + +#define ENTRY_MAPPING_BOOT_SETUP +#include "85xx_entry_mapping.S" +#undef ENTRY_MAPPING_BOOT_SETUP + +set_ivor: + /* Establish the interrupt vector offsets */ + SET_IVOR(0, CriticalInput); + SET_IVOR(1, MachineCheck); + SET_IVOR(2, DataStorage); + SET_IVOR(3, InstructionStorage); + SET_IVOR(4, ExternalInput); + SET_IVOR(5, Alignment); + SET_IVOR(6, Program); + SET_IVOR(7, FloatingPointUnavailable); + SET_IVOR(8, SystemCall); + SET_IVOR(9, AuxillaryProcessorUnavailable); + SET_IVOR(10, Decrementer); + SET_IVOR(11, FixedIntervalTimer); + SET_IVOR(12, WatchdogTimer); + SET_IVOR(13, DataTLBError); + SET_IVOR(14, InstructionTLBError); + SET_IVOR(15, DebugCrit); + + /* Establish the interrupt vector base */ + lis r4,interrupt_base@h /* IVPR only uses the high 16-bits */ + mtspr SPRN_IVPR,r4 + + /* Setup the defaults for TLB entries */ + li r2,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l + mtspr SPRN_MAS4, r2 + +#if !defined(CONFIG_BDI_SWITCH) + /* + * The Abatron BDI JTAG debugger does not tolerate others + * mucking with the debug registers. + */ + lis r2,DBCR0_IDM@h + mtspr SPRN_DBCR0,r2 + isync + /* clear any residual debug events */ + li r2,-1 + mtspr SPRN_DBSR,r2 +#endif + +#ifdef CONFIG_SMP + /* Check to see if we're the second processor, and jump + * to the secondary_start code if so + */ + LOAD_REG_ADDR_PIC(r24, boot_cpuid) + lwz r24, 0(r24) + cmpwi r24, -1 + mfspr r24,SPRN_PIR + bne __secondary_start +#endif + + /* + * This is where the main kernel code starts. + */ + + /* ptr to current */ + lis r2,init_task@h + ori r2,r2,init_task@l + + /* ptr to current thread */ + addi r4,r2,THREAD /* init task's THREAD */ + mtspr SPRN_SPRG_THREAD,r4 + + /* stack */ + lis r1,init_thread_union@h + ori r1,r1,init_thread_union@l + li r0,0 + stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + +#ifdef CONFIG_SMP + stw r24, TASK_CPU(r2) +#endif + + bl early_init + +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif +#ifdef CONFIG_RELOCATABLE + mr r3,r30 + mr r4,r31 +#ifdef CONFIG_PHYS_64BIT + mr r5,r23 + mr r6,r25 +#else + mr r5,r25 +#endif + bl relocate_init +#endif + +#ifdef CONFIG_DYNAMIC_MEMSTART + lis r3,kernstart_addr@ha + la r3,kernstart_addr@l(r3) +#ifdef CONFIG_PHYS_64BIT + stw r23,0(r3) + stw r25,4(r3) +#else + stw r25,0(r3) +#endif +#endif + +/* + * Decide what sort of machine this is and initialize the MMU. + */ + mr r3,r30 + mr r4,r31 + bl machine_init + bl MMU_init + + /* Setup PTE pointers for the Abatron bdiGDB */ + lis r6, swapper_pg_dir@h + ori r6, r6, swapper_pg_dir@l + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + lis r3, kernstart_virt_addr@ha + lwz r4, kernstart_virt_addr@l(r3) + stw r5, 0(r4) /* Save abatron_pteptrs at a fixed location */ + stw r6, 0(r5) + + /* Let's move on */ + lis r4,start_kernel@h + ori r4,r4,start_kernel@l + lis r3,MSR_KERNEL@h + ori r3,r3,MSR_KERNEL@l + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r3 + rfi /* change context and jump to start_kernel */ + +/* Macros to hide the PTE size differences + * + * FIND_PTE -- walks the page tables given EA & pgdir pointer + * r10 -- EA of fault + * r11 -- PGDIR pointer + * r12 -- free + * label 2: is the bailout case + * + * if we find the pte (fall through): + * r11 is low pte word + * r12 is pointer to the pte + * r10 is the pshift from the PGD, if we're a hugepage + */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_HUGETLB_PAGE +#define FIND_PTE \ + rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ + lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ + rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ + blt 1000f; /* Normal non-huge page */ \ + beq 2f; /* Bail if no table */ \ + oris r11, r11, PD_HUGE@h; /* Put back address bit */ \ + andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \ + xor r12, r10, r11; /* drop size bits from pointer */ \ + b 1001f; \ +1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ + li r10, 0; /* clear r10 */ \ +1001: lwz r11, 4(r12); /* Get pte entry */ +#else +#define FIND_PTE \ + rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ + lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ + rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ + beq 2f; /* Bail if no table */ \ + rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ + lwz r11, 4(r12); /* Get pte entry */ +#endif /* HUGEPAGE */ +#else /* !PTE_64BIT */ +#define FIND_PTE \ + rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \ + lwz r11, 0(r11); /* Get L1 entry */ \ + rlwinm. r12, r11, 0, 0, 19; /* Extract L2 (pte) base address */ \ + beq 2f; /* Bail if no table */ \ + rlwimi r12, r10, 22, 20, 29; /* Compute PTE address */ \ + lwz r11, 0(r12); /* Get Linux PTE */ +#endif + +/* + * Interrupt vector entry code + * + * The Book E MMUs are always on so we don't need to handle + * interrupts in real mode as with previous PPC processors. In + * this case we handle interrupts in the kernel virtual address + * space. + * + * Interrupt vectors are dynamically placed relative to the + * interrupt prefix as determined by the address of interrupt_base. + * The interrupt vectors offsets are programmed using the labels + * for each interrupt vector entry. + * + * Interrupt vectors must be aligned on a 16 byte boundary. + * We align on a 32 byte cache line boundary for good measure. + */ + +interrupt_base: + /* Critical Input Interrupt */ + CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception) + + /* Machine Check Interrupt */ + MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception) + + /* Data Storage Interrupt */ + START_EXCEPTION(DataStorage) + NORMAL_EXCEPTION_PROLOG(0x300, DATA_STORAGE) + mfspr r5,SPRN_ESR /* Grab the ESR, save it */ + stw r5,_ESR(r11) + mfspr r4,SPRN_DEAR /* Grab the DEAR, save it */ + stw r4, _DEAR(r11) + andis. r10,r5,(ESR_ILK|ESR_DLK)@h + bne 1f + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return +1: + prepare_transfer_to_handler + bl CacheLockingException + b interrupt_return + + /* Instruction Storage Interrupt */ + INSTRUCTION_STORAGE_EXCEPTION + + /* External Input Interrupt */ + EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ) + + /* Alignment Interrupt */ + ALIGNMENT_EXCEPTION + + /* Program Interrupt */ + PROGRAM_EXCEPTION + + /* Floating Point Unavailable Interrupt */ +#ifdef CONFIG_PPC_FPU + FP_UNAVAILABLE_EXCEPTION +#else + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, unknown_exception) +#endif + + /* System Call Interrupt */ + START_EXCEPTION(SystemCall) + SYSCALL_ENTRY 0xc00 BOOKE_INTERRUPT_SYSCALL SPRN_SRR1 + + /* Auxiliary Processor Unavailable Interrupt */ + EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, unknown_exception) + + /* Decrementer Interrupt */ + DECREMENTER_EXCEPTION + + /* Fixed Internal Timer Interrupt */ + /* TODO: Add FIT support */ + EXCEPTION(0x3100, FIT, FixedIntervalTimer, unknown_exception) + + /* Watchdog Timer Interrupt */ +#ifdef CONFIG_BOOKE_WDT + CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException) +#else + CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception) +#endif + + /* Data TLB Error Interrupt */ + START_EXCEPTION(DataTLBError) + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mfspr r10, SPRN_SPRG_THREAD + stw r11, THREAD_NORMSAVE(0)(r10) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + stw r12, THREAD_NORMSAVE(1)(r10) + stw r13, THREAD_NORMSAVE(2)(r10) + mfcr r13 + stw r13, THREAD_NORMSAVE(3)(r10) + DO_KVM BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1 +START_BTB_FLUSH_SECTION + mfspr r11, SPRN_SRR1 + andi. r10,r11,MSR_PR + beq 1f + BTB_FLUSH(r10) +1: +END_BTB_FLUSH_SECTION + mfspr r10, SPRN_DEAR /* Get faulting address */ + + /* If we are faulting a kernel address, we have to use the + * kernel page tables. + */ + lis r11, PAGE_OFFSET@h + cmplw 5, r10, r11 + blt 5, 3f + lis r11, swapper_pg_dir@h + ori r11, r11, swapper_pg_dir@l + + mfspr r12,SPRN_MAS1 /* Set TID to 0 */ + rlwinm r12,r12,0,16,1 + mtspr SPRN_MAS1,r12 + + b 4f + + /* Get the PGD for the current thread */ +3: + mfspr r11,SPRN_SPRG_THREAD + lwz r11,PGDIR(r11) + +#ifdef CONFIG_PPC_KUAP + mfspr r12, SPRN_MAS1 + rlwinm. r12,r12,0,0x3fff0000 + beq 2f /* KUAP fault */ +#endif + +4: + /* Mask of required permission bits. Note that while we + * do copy ESR:ST to _PAGE_RW position as trying to write + * to an RO page is pretty common, we don't do it with + * _PAGE_DIRTY. We could do it, but it's a fairly rare + * event so I'd rather take the overhead when it happens + * rather than adding an instruction here. We should measure + * whether the whole thing is worth it in the first place + * as we could avoid loading SPRN_ESR completely in the first + * place... + * + * TODO: Is it worth doing that mfspr & rlwimi in the first + * place or can we save a couple of instructions here ? + */ + mfspr r12,SPRN_ESR +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT + oris r13,r13,_PAGE_ACCESSED@h +#else + li r13,_PAGE_PRESENT|_PAGE_ACCESSED +#endif + rlwimi r13,r12,11,29,29 + + FIND_PTE + andc. r13,r13,r11 /* Check permission */ + +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ +#else + lwz r13,0(r12) /* Get upper pte bits */ +#endif +#endif + + bne 2f /* Bail if permission/valid mismatch */ + + /* Jump to common tlb load */ + b finish_tlb_load +2: + /* The bailout. Restore registers to pre-exception conditions + * and call the heavyweights to help us out. + */ + mfspr r10, SPRN_SPRG_THREAD + lwz r11, THREAD_NORMSAVE(3)(r10) + mtcr r11 + lwz r13, THREAD_NORMSAVE(2)(r10) + lwz r12, THREAD_NORMSAVE(1)(r10) + lwz r11, THREAD_NORMSAVE(0)(r10) + mfspr r10, SPRN_SPRG_RSCRATCH0 + b DataStorage + + /* Instruction TLB Error Interrupt */ + /* + * Nearly the same as above, except we get our + * information from different registers and bailout + * to a different point. + */ + START_EXCEPTION(InstructionTLBError) + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mfspr r10, SPRN_SPRG_THREAD + stw r11, THREAD_NORMSAVE(0)(r10) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + stw r12, THREAD_NORMSAVE(1)(r10) + stw r13, THREAD_NORMSAVE(2)(r10) + mfcr r13 + stw r13, THREAD_NORMSAVE(3)(r10) + DO_KVM BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1 +START_BTB_FLUSH_SECTION + mfspr r11, SPRN_SRR1 + andi. r10,r11,MSR_PR + beq 1f + BTB_FLUSH(r10) +1: +END_BTB_FLUSH_SECTION + + mfspr r10, SPRN_SRR0 /* Get faulting address */ + + /* If we are faulting a kernel address, we have to use the + * kernel page tables. + */ + lis r11, PAGE_OFFSET@h + cmplw 5, r10, r11 + blt 5, 3f + lis r11, swapper_pg_dir@h + ori r11, r11, swapper_pg_dir@l + + mfspr r12,SPRN_MAS1 /* Set TID to 0 */ + rlwinm r12,r12,0,16,1 + mtspr SPRN_MAS1,r12 + + /* Make up the required permissions for kernel code */ +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT | _PAGE_BAP_SX + oris r13,r13,_PAGE_ACCESSED@h +#else + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#endif + b 4f + + /* Get the PGD for the current thread */ +3: + mfspr r11,SPRN_SPRG_THREAD + lwz r11,PGDIR(r11) + +#ifdef CONFIG_PPC_KUAP + mfspr r12, SPRN_MAS1 + rlwinm. r12,r12,0,0x3fff0000 + beq 2f /* KUAP fault */ +#endif + + /* Make up the required permissions for user code */ +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT | _PAGE_BAP_UX + oris r13,r13,_PAGE_ACCESSED@h +#else + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#endif + +4: + FIND_PTE + andc. r13,r13,r11 /* Check permission */ + +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ +#else + lwz r13,0(r12) /* Get upper pte bits */ +#endif +#endif + + bne 2f /* Bail if permission mismatch */ + + /* Jump to common TLB load point */ + b finish_tlb_load + +2: + /* The bailout. Restore registers to pre-exception conditions + * and call the heavyweights to help us out. + */ + mfspr r10, SPRN_SPRG_THREAD + lwz r11, THREAD_NORMSAVE(3)(r10) + mtcr r11 + lwz r13, THREAD_NORMSAVE(2)(r10) + lwz r12, THREAD_NORMSAVE(1)(r10) + lwz r11, THREAD_NORMSAVE(0)(r10) + mfspr r10, SPRN_SPRG_RSCRATCH0 + b InstructionStorage + +/* Define SPE handlers for e500v2 */ +#ifdef CONFIG_SPE + /* SPE Unavailable */ + START_EXCEPTION(SPEUnavailable) + NORMAL_EXCEPTION_PROLOG(0x2010, SPE_UNAVAIL) + beq 1f + bl load_up_spe + b fast_exception_return +1: prepare_transfer_to_handler + bl KernelSPE + b interrupt_return +#elif defined(CONFIG_SPE_POSSIBLE) + EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, unknown_exception) +#endif /* CONFIG_SPE_POSSIBLE */ + + /* SPE Floating Point Data */ +#ifdef CONFIG_SPE + START_EXCEPTION(SPEFloatingPointData) + NORMAL_EXCEPTION_PROLOG(0x2030, SPE_FP_DATA) + prepare_transfer_to_handler + bl SPEFloatingPointException + REST_NVGPRS(r1) + b interrupt_return + + /* SPE Floating Point Round */ + START_EXCEPTION(SPEFloatingPointRound) + NORMAL_EXCEPTION_PROLOG(0x2050, SPE_FP_ROUND) + prepare_transfer_to_handler + bl SPEFloatingPointRoundException + REST_NVGPRS(r1) + b interrupt_return +#elif defined(CONFIG_SPE_POSSIBLE) + EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, unknown_exception) + EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, unknown_exception) +#endif /* CONFIG_SPE_POSSIBLE */ + + + /* Performance Monitor */ + EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \ + performance_monitor_exception) + + EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception) + + CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \ + CriticalDoorbell, unknown_exception) + + /* Debug Interrupt */ + DEBUG_DEBUG_EXCEPTION + DEBUG_CRIT_EXCEPTION + + GUEST_DOORBELL_EXCEPTION + + CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \ + unknown_exception) + + /* Hypercall */ + EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception) + + /* Embedded Hypervisor Privilege */ + EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception) + +interrupt_end: + +/* + * Local functions + */ + +/* + * Both the instruction and data TLB miss get to this + * point to load the TLB. + * r10 - tsize encoding (if HUGETLB_PAGE) or available to use + * r11 - TLB (info from Linux PTE) + * r12 - available to use + * r13 - upper bits of PTE (if PTE_64BIT) or available to use + * CR5 - results of addr >= PAGE_OFFSET + * MAS0, MAS1 - loaded with proper value when we get here + * MAS2, MAS3 - will need additional info from Linux PTE + * Upon exit, we reload everything and RFI. + */ +finish_tlb_load: +#ifdef CONFIG_HUGETLB_PAGE + cmpwi 6, r10, 0 /* check for huge page */ + beq 6, finish_tlb_load_cont /* !huge */ + + /* Alas, we need more scratch registers for hugepages */ + mfspr r12, SPRN_SPRG_THREAD + stw r14, THREAD_NORMSAVE(4)(r12) + stw r15, THREAD_NORMSAVE(5)(r12) + stw r16, THREAD_NORMSAVE(6)(r12) + stw r17, THREAD_NORMSAVE(7)(r12) + + /* Get the next_tlbcam_idx percpu var */ +#ifdef CONFIG_SMP + lwz r15, TASK_CPU-THREAD(r12) + lis r14, __per_cpu_offset@h + ori r14, r14, __per_cpu_offset@l + rlwinm r15, r15, 2, 0, 29 + lwzx r16, r14, r15 +#else + li r16, 0 +#endif + lis r17, next_tlbcam_idx@h + ori r17, r17, next_tlbcam_idx@l + add r17, r17, r16 /* r17 = *next_tlbcam_idx */ + lwz r15, 0(r17) /* r15 = next_tlbcam_idx */ + + lis r14, MAS0_TLBSEL(1)@h /* select TLB1 (TLBCAM) */ + rlwimi r14, r15, 16, 4, 15 /* next_tlbcam_idx entry */ + mtspr SPRN_MAS0, r14 + + /* Extract TLB1CFG(NENTRY) */ + mfspr r16, SPRN_TLB1CFG + andi. r16, r16, 0xfff + + /* Update next_tlbcam_idx, wrapping when necessary */ + addi r15, r15, 1 + cmpw r15, r16 + blt 100f + lis r14, tlbcam_index@h + ori r14, r14, tlbcam_index@l + lwz r15, 0(r14) +100: stw r15, 0(r17) + + /* + * Calc MAS1_TSIZE from r10 (which has pshift encoded) + * tlb_enc = (pshift - 10). + */ + subi r15, r10, 10 + mfspr r16, SPRN_MAS1 + rlwimi r16, r15, 7, 20, 24 + mtspr SPRN_MAS1, r16 + + /* copy the pshift for use later */ + mr r14, r10 + + /* fall through */ + +#endif /* CONFIG_HUGETLB_PAGE */ + + /* + * We set execute, because we don't have the granularity to + * properly set this at the page level (Linux problem). + * Many of these bits are software only. Bits we don't set + * here we (properly should) assume have the appropriate value. + */ +finish_tlb_load_cont: +#ifdef CONFIG_PTE_64BIT + rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */ + andi. r10, r11, _PAGE_DIRTY + bne 1f + li r10, MAS3_SW | MAS3_UW + andc r12, r12, r10 +1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */ + rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */ +2: mtspr SPRN_MAS3, r12 +BEGIN_MMU_FTR_SECTION + srwi r10, r13, 12 /* grab RPN[12:31] */ + mtspr SPRN_MAS7, r10 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) +#else + li r10, (_PAGE_EXEC | _PAGE_PRESENT) + mr r13, r11 + rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ + and r12, r11, r10 + andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ + slwi r10, r12, 1 + or r10, r10, r12 + rlwinm r10, r10, 0, ~_PAGE_EXEC /* Clear SX on user pages */ + iseleq r12, r12, r10 + rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */ + mtspr SPRN_MAS3, r13 +#endif + + mfspr r12, SPRN_MAS2 +#ifdef CONFIG_PTE_64BIT + rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ +#else + rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ +#endif +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 3f /* don't mask if page isn't huge */ + li r13, 1 + slw r13, r13, r14 + subi r13, r13, 1 + rlwinm r13, r13, 0, 0, 19 /* bottom bits used for WIMGE/etc */ + andc r12, r12, r13 /* mask off ea bits within the page */ +#endif +3: mtspr SPRN_MAS2, r12 + +tlb_write_entry: + tlbwe + + /* Done...restore registers and get out of here. */ + mfspr r10, SPRN_SPRG_THREAD +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 8f /* skip restore for 4k page faults */ + lwz r14, THREAD_NORMSAVE(4)(r10) + lwz r15, THREAD_NORMSAVE(5)(r10) + lwz r16, THREAD_NORMSAVE(6)(r10) + lwz r17, THREAD_NORMSAVE(7)(r10) +#endif +8: lwz r11, THREAD_NORMSAVE(3)(r10) + mtcr r11 + lwz r13, THREAD_NORMSAVE(2)(r10) + lwz r12, THREAD_NORMSAVE(1)(r10) + lwz r11, THREAD_NORMSAVE(0)(r10) + mfspr r10, SPRN_SPRG_RSCRATCH0 + rfi /* Force context change */ + +#ifdef CONFIG_SPE +/* Note that the SPE support is closely modeled after the AltiVec + * support. Changes to one are likely to be applicable to the + * other! */ +_GLOBAL(load_up_spe) +/* + * Disable SPE for the task which had SPE previously, + * and save its SPE registers in its thread_struct. + * Enables SPE for use in the kernel on return. + * On SMP we know the SPE units are free, since we give it up every + * switch. -- Kumar + */ + mfmsr r5 + oris r5,r5,MSR_SPE@h + mtmsr r5 /* enable use of SPE now */ + isync + /* enable use of SPE after return */ + oris r9,r9,MSR_SPE@h + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + li r4,1 + li r10,THREAD_ACC + stw r4,THREAD_USED_SPE(r5) + evlddx evr4,r10,r5 + evmra evr4,evr4 + REST_32EVRS(0,r10,r5,THREAD_EVR0) + blr + +/* + * SPE unavailable trap from kernel - print a message, but let + * the task use SPE in the kernel until it returns to user mode. + */ +KernelSPE: + lwz r3,_MSR(r1) + oris r3,r3,MSR_SPE@h + stw r3,_MSR(r1) /* enable use of SPE after return */ +#ifdef CONFIG_PRINTK + lis r3,87f@h + ori r3,r3,87f@l + mr r4,r2 /* current */ + lwz r5,_NIP(r1) + bl _printk +#endif + b interrupt_return +#ifdef CONFIG_PRINTK +87: .string "SPE used in kernel (task=%p, pc=%x) \n" +#endif + .align 4,0 + +#endif /* CONFIG_SPE */ + +/* + * Translate the effec addr in r3 to phys addr. The phys addr will be put + * into r3(higher 32bit) and r4(lower 32bit) + */ +get_phys_addr: + mfmsr r8 + mfspr r9,SPRN_PID + rlwinm r9,r9,16,0x3fff0000 /* turn PID into MAS6[SPID] */ + rlwimi r9,r8,28,0x00000001 /* turn MSR[DS] into MAS6[SAS] */ + mtspr SPRN_MAS6,r9 + + tlbsx 0,r3 /* must succeed */ + + mfspr r8,SPRN_MAS1 + mfspr r12,SPRN_MAS3 + rlwinm r9,r8,25,0x1f /* r9 = log2(page size) */ + li r10,1024 + slw r10,r10,r9 /* r10 = page size */ + addi r10,r10,-1 + and r11,r3,r10 /* r11 = page offset */ + andc r4,r12,r10 /* r4 = page base */ + or r4,r4,r11 /* r4 = devtree phys addr */ +#ifdef CONFIG_PHYS_64BIT + mfspr r3,SPRN_MAS7 +#endif + blr + +/* + * Global functions + */ + +#ifdef CONFIG_E500 +#ifndef CONFIG_PPC_E500MC +/* Adjust or setup IVORs for e500v1/v2 */ +_GLOBAL(__setup_e500_ivors) + li r3,DebugCrit@l + mtspr SPRN_IVOR15,r3 + li r3,SPEUnavailable@l + mtspr SPRN_IVOR32,r3 + li r3,SPEFloatingPointData@l + mtspr SPRN_IVOR33,r3 + li r3,SPEFloatingPointRound@l + mtspr SPRN_IVOR34,r3 + li r3,PerformanceMonitor@l + mtspr SPRN_IVOR35,r3 + sync + blr +#else +/* Adjust or setup IVORs for e500mc */ +_GLOBAL(__setup_e500mc_ivors) + li r3,DebugDebug@l + mtspr SPRN_IVOR15,r3 + li r3,PerformanceMonitor@l + mtspr SPRN_IVOR35,r3 + li r3,Doorbell@l + mtspr SPRN_IVOR36,r3 + li r3,CriticalDoorbell@l + mtspr SPRN_IVOR37,r3 + sync + blr + +/* setup ehv ivors for */ +_GLOBAL(__setup_ehv_ivors) + li r3,GuestDoorbell@l + mtspr SPRN_IVOR38,r3 + li r3,CriticalGuestDoorbell@l + mtspr SPRN_IVOR39,r3 + li r3,Hypercall@l + mtspr SPRN_IVOR40,r3 + li r3,Ehvpriv@l + mtspr SPRN_IVOR41,r3 + sync + blr +#endif /* CONFIG_PPC_E500MC */ +#endif /* CONFIG_E500 */ + +#ifdef CONFIG_SPE +/* + * extern void __giveup_spe(struct task_struct *prev) + * + */ +_GLOBAL(__giveup_spe) + addi r3,r3,THREAD /* want THREAD of task */ + lwz r5,PT_REGS(r3) + cmpi 0,r5,0 + SAVE_32EVRS(0, r4, r3, THREAD_EVR0) + evxor evr6, evr6, evr6 /* clear out evr6 */ + evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */ + li r4,THREAD_ACC + evstddx evr6, r4, r3 /* save off accumulator */ + beq 1f + lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r3,MSR_SPE@h + andc r4,r4,r3 /* disable SPE for previous task */ + stw r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: + blr +#endif /* CONFIG_SPE */ + +/* + * extern void abort(void) + * + * At present, this routine just applies a system reset. + */ +_GLOBAL(abort) + li r13,0 + mtspr SPRN_DBCR0,r13 /* disable all debug events */ + isync + mfmsr r13 + ori r13,r13,MSR_DE@l /* Enable Debug Events */ + mtmsr r13 + isync + mfspr r13,SPRN_DBCR0 + lis r13,(DBCR0_IDM|DBCR0_RST_CHIP)@h + mtspr SPRN_DBCR0,r13 + isync + +#ifdef CONFIG_SMP +/* When we get here, r24 needs to hold the CPU # */ + .globl __secondary_start +__secondary_start: + LOAD_REG_ADDR_PIC(r3, tlbcam_index) + lwz r3,0(r3) + mtctr r3 + li r26,0 /* r26 safe? */ + + bl switch_to_as1 + mr r27,r3 /* tlb entry */ + /* Load each CAM entry */ +1: mr r3,r26 + bl loadcam_entry + addi r26,r26,1 + bdnz 1b + mr r3,r27 /* tlb entry */ + LOAD_REG_ADDR_PIC(r4, memstart_addr) + lwz r4,0(r4) + mr r5,r25 /* phys kernel start */ + rlwinm r5,r5,0,~0x3ffffff /* aligned 64M */ + subf r4,r5,r4 /* memstart_addr - phys kernel start */ + lis r7,KERNELBASE@h + ori r7,r7,KERNELBASE@l + cmpw r20,r7 /* if kernstart_virt_addr != KERNELBASE, randomized */ + beq 2f + li r4,0 +2: li r5,0 /* no device tree */ + li r6,0 /* not boot cpu */ + bl restore_to_as0 + + + lis r3,__secondary_hold_acknowledge@h + ori r3,r3,__secondary_hold_acknowledge@l + stw r24,0(r3) + + li r3,0 + mr r4,r24 /* Why? */ + bl call_setup_cpu + + /* get current's stack and current */ + lis r2,secondary_current@ha + lwz r2,secondary_current@l(r2) + lwz r1,TASK_STACK(r2) + + /* stack */ + addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + li r0,0 + stw r0,0(r1) + + /* ptr to current thread */ + addi r4,r2,THREAD /* address of our thread_struct */ + mtspr SPRN_SPRG_THREAD,r4 + + /* Setup the defaults for TLB entries */ + li r4,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l + mtspr SPRN_MAS4,r4 + + /* Jump to start_secondary */ + lis r4,MSR_KERNEL@h + ori r4,r4,MSR_KERNEL@l + lis r3,start_secondary@h + ori r3,r3,start_secondary@l + mtspr SPRN_SRR0,r3 + mtspr SPRN_SRR1,r4 + sync + rfi + sync + + .globl __secondary_hold_acknowledge +__secondary_hold_acknowledge: + .long -1 +#endif + +/* + * Create a 64M tlb by address and entry + * r3 - entry + * r4 - virtual address + * r5/r6 - physical address + */ +_GLOBAL(create_kaslr_tlb_entry) + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 /* Write MAS0 */ + + lis r3,(MAS1_VALID|MAS1_IPROT)@h + ori r3,r3,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l + mtspr SPRN_MAS1,r3 /* Write MAS1 */ + + lis r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h + ori r3,r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l + and r3,r3,r4 + ori r3,r3,MAS2_M_IF_NEEDED@l + mtspr SPRN_MAS2,r3 /* Write MAS2(EPN) */ + +#ifdef CONFIG_PHYS_64BIT + ori r8,r6,(MAS3_SW|MAS3_SR|MAS3_SX) + mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ + mtspr SPRN_MAS7,r5 +#else + ori r8,r5,(MAS3_SW|MAS3_SR|MAS3_SX) + mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ +#endif + + tlbwe /* Write TLB */ + isync + sync + blr + +/* + * Return to the start of the relocated kernel and run again + * r3 - virtual address of fdt + * r4 - entry of the kernel + */ +_GLOBAL(reloc_kernel_entry) + mfmsr r7 + rlwinm r7, r7, 0, ~(MSR_IS | MSR_DS) + + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r7 + rfi + +/* + * Create a tlb entry with the same effective and physical address as + * the tlb entry used by the current running code. But set the TS to 1. + * Then switch to the address space 1. It will return with the r3 set to + * the ESEL of the new created tlb. + */ +_GLOBAL(switch_to_as1) + mflr r5 + + /* Find a entry not used */ + mfspr r3,SPRN_TLB1CFG + andi. r3,r3,0xfff + mfspr r4,SPRN_PID + rlwinm r4,r4,16,0x3fff0000 /* turn PID into MAS6[SPID] */ + mtspr SPRN_MAS6,r4 +1: lis r4,0x1000 /* Set MAS0(TLBSEL) = 1 */ + addi r3,r3,-1 + rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r4 + tlbre + mfspr r4,SPRN_MAS1 + andis. r4,r4,MAS1_VALID@h + bne 1b + + /* Get the tlb entry used by the current running code */ + bcl 20,31,$+4 +0: mflr r4 + tlbsx 0,r4 + + mfspr r4,SPRN_MAS1 + ori r4,r4,MAS1_TS /* Set the TS = 1 */ + mtspr SPRN_MAS1,r4 + + mfspr r4,SPRN_MAS0 + rlwinm r4,r4,0,~MAS0_ESEL_MASK + rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r4 + tlbwe + isync + sync + + mfmsr r4 + ori r4,r4,MSR_IS | MSR_DS + mtspr SPRN_SRR0,r5 + mtspr SPRN_SRR1,r4 + sync + rfi + +/* + * Restore to the address space 0 and also invalidate the tlb entry created + * by switch_to_as1. + * r3 - the tlb entry which should be invalidated + * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0) + * r5 - device tree virtual address. If r4 is 0, r5 is ignored. + * r6 - boot cpu +*/ +_GLOBAL(restore_to_as0) + mflr r0 + + bcl 20,31,$+4 +0: mflr r9 + addi r9,r9,1f - 0b + + /* + * We may map the PAGE_OFFSET in AS0 to a different physical address, + * so we need calculate the right jump and device tree address based + * on the offset passed by r4. + */ + add r9,r9,r4 + add r5,r5,r4 + add r0,r0,r4 + +2: mfmsr r7 + li r8,(MSR_IS | MSR_DS) + andc r7,r7,r8 + + mtspr SPRN_SRR0,r9 + mtspr SPRN_SRR1,r7 + sync + rfi + + /* Invalidate the temporary tlb entry for AS1 */ +1: lis r9,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r9,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r9 + tlbre + mfspr r9,SPRN_MAS1 + rlwinm r9,r9,0,2,31 /* Clear MAS1 Valid and IPPROT */ + mtspr SPRN_MAS1,r9 + tlbwe + isync + + cmpwi r4,0 + cmpwi cr1,r6,0 + cror eq,4*cr1+eq,eq + bne 3f /* offset != 0 && is_boot_cpu */ + mtlr r0 + blr + + /* + * The PAGE_OFFSET will map to a different physical address, + * jump to _start to do another relocation again. + */ +3: mr r3,r5 + bl _start diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S deleted file mode 100644 index f0db4f52bc00..000000000000 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ /dev/null @@ -1,1227 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Kernel execution entry point code. - * - * Copyright (c) 1995-1996 Gary Thomas - * Initial PowerPC version. - * Copyright (c) 1996 Cort Dougan - * Rewritten for PReP - * Copyright (c) 1996 Paul Mackerras - * Low-level exception handers, MMU support, and rewrite. - * Copyright (c) 1997 Dan Malek - * PowerPC 8xx modifications. - * Copyright (c) 1998-1999 TiVo, Inc. - * PowerPC 403GCX modifications. - * Copyright (c) 1999 Grant Erickson - * PowerPC 403GCX/405GP modifications. - * Copyright 2000 MontaVista Software Inc. - * PPC405 modifications - * PowerPC 403GCX/405GP modifications. - * Author: MontaVista Software, Inc. - * frank_rowand@mvista.com or source@mvista.com - * debbie_chu@mvista.com - * Copyright 2002-2004 MontaVista Software, Inc. - * PowerPC 44x support, Matt Porter - * Copyright 2004 Freescale Semiconductor, Inc - * PowerPC e500 modifications, Kumar Gala - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "head_booke.h" - -/* As with the other PowerPC ports, it is expected that when code - * execution begins here, the following registers contain valid, yet - * optional, information: - * - * r3 - Board info structure pointer (DRAM, frequency, MAC address, etc.) - * r4 - Starting address of the init RAM disk - * r5 - Ending address of the init RAM disk - * r6 - Start of kernel command line string (e.g. "mem=128") - * r7 - End of kernel command line string - * - */ - __HEAD -_GLOBAL(_stext); -_GLOBAL(_start); - /* - * Reserve a word at a fixed location to store the address - * of abatron_pteptrs - */ - nop - - /* Translate device tree address to physical, save in r30/r31 */ - bl get_phys_addr - mr r30,r3 - mr r31,r4 - - li r25,0 /* phys kernel start (low) */ - li r24,0 /* CPU number */ - li r23,0 /* phys kernel start (high) */ - -#ifdef CONFIG_RELOCATABLE - LOAD_REG_ADDR_PIC(r3, _stext) /* Get our current runtime base */ - - /* Translate _stext address to physical, save in r23/r25 */ - bl get_phys_addr - mr r23,r3 - mr r25,r4 - - bcl 20,31,$+4 -0: mflr r8 - addis r3,r8,(is_second_reloc - 0b)@ha - lwz r19,(is_second_reloc - 0b)@l(r3) - - /* Check if this is the second relocation. */ - cmpwi r19,1 - bne 1f - - /* - * For the second relocation, we already get the real memstart_addr - * from device tree. So we will map PAGE_OFFSET to memstart_addr, - * then the virtual address of start kernel should be: - * PAGE_OFFSET + (kernstart_addr - memstart_addr) - * Since the offset between kernstart_addr and memstart_addr should - * never be beyond 1G, so we can just use the lower 32bit of them - * for the calculation. - */ - lis r3,PAGE_OFFSET@h - - addis r4,r8,(kernstart_addr - 0b)@ha - addi r4,r4,(kernstart_addr - 0b)@l - lwz r5,4(r4) - - addis r6,r8,(memstart_addr - 0b)@ha - addi r6,r6,(memstart_addr - 0b)@l - lwz r7,4(r6) - - subf r5,r7,r5 - add r3,r3,r5 - b 2f - -1: - /* - * We have the runtime (virtual) address of our base. - * We calculate our shift of offset from a 64M page. - * We could map the 64M page we belong to at PAGE_OFFSET and - * get going from there. - */ - lis r4,KERNELBASE@h - ori r4,r4,KERNELBASE@l - rlwinm r6,r25,0,0x3ffffff /* r6 = PHYS_START % 64M */ - rlwinm r5,r4,0,0x3ffffff /* r5 = KERNELBASE % 64M */ - subf r3,r5,r6 /* r3 = r6 - r5 */ - add r3,r4,r3 /* Required Virtual Address */ - -2: bl relocate - - /* - * For the second relocation, we already set the right tlb entries - * for the kernel space, so skip the code in fsl_booke_entry_mapping.S - */ - cmpwi r19,1 - beq set_ivor -#endif - -/* We try to not make any assumptions about how the boot loader - * setup or used the TLBs. We invalidate all mappings from the - * boot loader and load a single entry in TLB1[0] to map the - * first 64M of kernel memory. Any boot info passed from the - * bootloader needs to live in this first 64M. - * - * Requirement on bootloader: - * - The page we're executing in needs to reside in TLB1 and - * have IPROT=1. If not an invalidate broadcast could - * evict the entry we're currently executing in. - * - * r3 = Index of TLB1 were executing in - * r4 = Current MSR[IS] - * r5 = Index of TLB1 temp mapping - * - * Later in mapin_ram we will correctly map lowmem, and resize TLB1[0] - * if needed - */ - -_GLOBAL(__early_start) - LOAD_REG_ADDR_PIC(r20, kernstart_virt_addr) - lwz r20,0(r20) - -#define ENTRY_MAPPING_BOOT_SETUP -#include "fsl_booke_entry_mapping.S" -#undef ENTRY_MAPPING_BOOT_SETUP - -set_ivor: - /* Establish the interrupt vector offsets */ - SET_IVOR(0, CriticalInput); - SET_IVOR(1, MachineCheck); - SET_IVOR(2, DataStorage); - SET_IVOR(3, InstructionStorage); - SET_IVOR(4, ExternalInput); - SET_IVOR(5, Alignment); - SET_IVOR(6, Program); - SET_IVOR(7, FloatingPointUnavailable); - SET_IVOR(8, SystemCall); - SET_IVOR(9, AuxillaryProcessorUnavailable); - SET_IVOR(10, Decrementer); - SET_IVOR(11, FixedIntervalTimer); - SET_IVOR(12, WatchdogTimer); - SET_IVOR(13, DataTLBError); - SET_IVOR(14, InstructionTLBError); - SET_IVOR(15, DebugCrit); - - /* Establish the interrupt vector base */ - lis r4,interrupt_base@h /* IVPR only uses the high 16-bits */ - mtspr SPRN_IVPR,r4 - - /* Setup the defaults for TLB entries */ - li r2,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l - mtspr SPRN_MAS4, r2 - -#if !defined(CONFIG_BDI_SWITCH) - /* - * The Abatron BDI JTAG debugger does not tolerate others - * mucking with the debug registers. - */ - lis r2,DBCR0_IDM@h - mtspr SPRN_DBCR0,r2 - isync - /* clear any residual debug events */ - li r2,-1 - mtspr SPRN_DBSR,r2 -#endif - -#ifdef CONFIG_SMP - /* Check to see if we're the second processor, and jump - * to the secondary_start code if so - */ - LOAD_REG_ADDR_PIC(r24, boot_cpuid) - lwz r24, 0(r24) - cmpwi r24, -1 - mfspr r24,SPRN_PIR - bne __secondary_start -#endif - - /* - * This is where the main kernel code starts. - */ - - /* ptr to current */ - lis r2,init_task@h - ori r2,r2,init_task@l - - /* ptr to current thread */ - addi r4,r2,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG_THREAD,r4 - - /* stack */ - lis r1,init_thread_union@h - ori r1,r1,init_thread_union@l - li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) - -#ifdef CONFIG_SMP - stw r24, TASK_CPU(r2) -#endif - - bl early_init - -#ifdef CONFIG_KASAN - bl kasan_early_init -#endif -#ifdef CONFIG_RELOCATABLE - mr r3,r30 - mr r4,r31 -#ifdef CONFIG_PHYS_64BIT - mr r5,r23 - mr r6,r25 -#else - mr r5,r25 -#endif - bl relocate_init -#endif - -#ifdef CONFIG_DYNAMIC_MEMSTART - lis r3,kernstart_addr@ha - la r3,kernstart_addr@l(r3) -#ifdef CONFIG_PHYS_64BIT - stw r23,0(r3) - stw r25,4(r3) -#else - stw r25,0(r3) -#endif -#endif - -/* - * Decide what sort of machine this is and initialize the MMU. - */ - mr r3,r30 - mr r4,r31 - bl machine_init - bl MMU_init - - /* Setup PTE pointers for the Abatron bdiGDB */ - lis r6, swapper_pg_dir@h - ori r6, r6, swapper_pg_dir@l - lis r5, abatron_pteptrs@h - ori r5, r5, abatron_pteptrs@l - lis r3, kernstart_virt_addr@ha - lwz r4, kernstart_virt_addr@l(r3) - stw r5, 0(r4) /* Save abatron_pteptrs at a fixed location */ - stw r6, 0(r5) - - /* Let's move on */ - lis r4,start_kernel@h - ori r4,r4,start_kernel@l - lis r3,MSR_KERNEL@h - ori r3,r3,MSR_KERNEL@l - mtspr SPRN_SRR0,r4 - mtspr SPRN_SRR1,r3 - rfi /* change context and jump to start_kernel */ - -/* Macros to hide the PTE size differences - * - * FIND_PTE -- walks the page tables given EA & pgdir pointer - * r10 -- EA of fault - * r11 -- PGDIR pointer - * r12 -- free - * label 2: is the bailout case - * - * if we find the pte (fall through): - * r11 is low pte word - * r12 is pointer to the pte - * r10 is the pshift from the PGD, if we're a hugepage - */ -#ifdef CONFIG_PTE_64BIT -#ifdef CONFIG_HUGETLB_PAGE -#define FIND_PTE \ - rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ - lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ - rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ - blt 1000f; /* Normal non-huge page */ \ - beq 2f; /* Bail if no table */ \ - oris r11, r11, PD_HUGE@h; /* Put back address bit */ \ - andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \ - xor r12, r10, r11; /* drop size bits from pointer */ \ - b 1001f; \ -1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ - li r10, 0; /* clear r10 */ \ -1001: lwz r11, 4(r12); /* Get pte entry */ -#else -#define FIND_PTE \ - rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ - lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ - rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ - beq 2f; /* Bail if no table */ \ - rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ - lwz r11, 4(r12); /* Get pte entry */ -#endif /* HUGEPAGE */ -#else /* !PTE_64BIT */ -#define FIND_PTE \ - rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \ - lwz r11, 0(r11); /* Get L1 entry */ \ - rlwinm. r12, r11, 0, 0, 19; /* Extract L2 (pte) base address */ \ - beq 2f; /* Bail if no table */ \ - rlwimi r12, r10, 22, 20, 29; /* Compute PTE address */ \ - lwz r11, 0(r12); /* Get Linux PTE */ -#endif - -/* - * Interrupt vector entry code - * - * The Book E MMUs are always on so we don't need to handle - * interrupts in real mode as with previous PPC processors. In - * this case we handle interrupts in the kernel virtual address - * space. - * - * Interrupt vectors are dynamically placed relative to the - * interrupt prefix as determined by the address of interrupt_base. - * The interrupt vectors offsets are programmed using the labels - * for each interrupt vector entry. - * - * Interrupt vectors must be aligned on a 16 byte boundary. - * We align on a 32 byte cache line boundary for good measure. - */ - -interrupt_base: - /* Critical Input Interrupt */ - CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception) - - /* Machine Check Interrupt */ - MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception) - - /* Data Storage Interrupt */ - START_EXCEPTION(DataStorage) - NORMAL_EXCEPTION_PROLOG(0x300, DATA_STORAGE) - mfspr r5,SPRN_ESR /* Grab the ESR, save it */ - stw r5,_ESR(r11) - mfspr r4,SPRN_DEAR /* Grab the DEAR, save it */ - stw r4, _DEAR(r11) - andis. r10,r5,(ESR_ILK|ESR_DLK)@h - bne 1f - prepare_transfer_to_handler - bl do_page_fault - b interrupt_return -1: - prepare_transfer_to_handler - bl CacheLockingException - b interrupt_return - - /* Instruction Storage Interrupt */ - INSTRUCTION_STORAGE_EXCEPTION - - /* External Input Interrupt */ - EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ) - - /* Alignment Interrupt */ - ALIGNMENT_EXCEPTION - - /* Program Interrupt */ - PROGRAM_EXCEPTION - - /* Floating Point Unavailable Interrupt */ -#ifdef CONFIG_PPC_FPU - FP_UNAVAILABLE_EXCEPTION -#else - EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, unknown_exception) -#endif - - /* System Call Interrupt */ - START_EXCEPTION(SystemCall) - SYSCALL_ENTRY 0xc00 BOOKE_INTERRUPT_SYSCALL SPRN_SRR1 - - /* Auxiliary Processor Unavailable Interrupt */ - EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, unknown_exception) - - /* Decrementer Interrupt */ - DECREMENTER_EXCEPTION - - /* Fixed Internal Timer Interrupt */ - /* TODO: Add FIT support */ - EXCEPTION(0x3100, FIT, FixedIntervalTimer, unknown_exception) - - /* Watchdog Timer Interrupt */ -#ifdef CONFIG_BOOKE_WDT - CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException) -#else - CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception) -#endif - - /* Data TLB Error Interrupt */ - START_EXCEPTION(DataTLBError) - mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ - mfspr r10, SPRN_SPRG_THREAD - stw r11, THREAD_NORMSAVE(0)(r10) -#ifdef CONFIG_KVM_BOOKE_HV -BEGIN_FTR_SECTION - mfspr r11, SPRN_SRR1 -END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) -#endif - stw r12, THREAD_NORMSAVE(1)(r10) - stw r13, THREAD_NORMSAVE(2)(r10) - mfcr r13 - stw r13, THREAD_NORMSAVE(3)(r10) - DO_KVM BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1 -START_BTB_FLUSH_SECTION - mfspr r11, SPRN_SRR1 - andi. r10,r11,MSR_PR - beq 1f - BTB_FLUSH(r10) -1: -END_BTB_FLUSH_SECTION - mfspr r10, SPRN_DEAR /* Get faulting address */ - - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ - lis r11, PAGE_OFFSET@h - cmplw 5, r10, r11 - blt 5, 3f - lis r11, swapper_pg_dir@h - ori r11, r11, swapper_pg_dir@l - - mfspr r12,SPRN_MAS1 /* Set TID to 0 */ - rlwinm r12,r12,0,16,1 - mtspr SPRN_MAS1,r12 - - b 4f - - /* Get the PGD for the current thread */ -3: - mfspr r11,SPRN_SPRG_THREAD - lwz r11,PGDIR(r11) - -#ifdef CONFIG_PPC_KUAP - mfspr r12, SPRN_MAS1 - rlwinm. r12,r12,0,0x3fff0000 - beq 2f /* KUAP fault */ -#endif - -4: - /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_RW position as trying to write - * to an RO page is pretty common, we don't do it with - * _PAGE_DIRTY. We could do it, but it's a fairly rare - * event so I'd rather take the overhead when it happens - * rather than adding an instruction here. We should measure - * whether the whole thing is worth it in the first place - * as we could avoid loading SPRN_ESR completely in the first - * place... - * - * TODO: Is it worth doing that mfspr & rlwimi in the first - * place or can we save a couple of instructions here ? - */ - mfspr r12,SPRN_ESR -#ifdef CONFIG_PTE_64BIT - li r13,_PAGE_PRESENT - oris r13,r13,_PAGE_ACCESSED@h -#else - li r13,_PAGE_PRESENT|_PAGE_ACCESSED -#endif - rlwimi r13,r12,11,29,29 - - FIND_PTE - andc. r13,r13,r11 /* Check permission */ - -#ifdef CONFIG_PTE_64BIT -#ifdef CONFIG_SMP - subf r13,r11,r12 /* create false data dep */ - lwzx r13,r11,r13 /* Get upper pte bits */ -#else - lwz r13,0(r12) /* Get upper pte bits */ -#endif -#endif - - bne 2f /* Bail if permission/valid mismatch */ - - /* Jump to common tlb load */ - b finish_tlb_load -2: - /* The bailout. Restore registers to pre-exception conditions - * and call the heavyweights to help us out. - */ - mfspr r10, SPRN_SPRG_THREAD - lwz r11, THREAD_NORMSAVE(3)(r10) - mtcr r11 - lwz r13, THREAD_NORMSAVE(2)(r10) - lwz r12, THREAD_NORMSAVE(1)(r10) - lwz r11, THREAD_NORMSAVE(0)(r10) - mfspr r10, SPRN_SPRG_RSCRATCH0 - b DataStorage - - /* Instruction TLB Error Interrupt */ - /* - * Nearly the same as above, except we get our - * information from different registers and bailout - * to a different point. - */ - START_EXCEPTION(InstructionTLBError) - mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ - mfspr r10, SPRN_SPRG_THREAD - stw r11, THREAD_NORMSAVE(0)(r10) -#ifdef CONFIG_KVM_BOOKE_HV -BEGIN_FTR_SECTION - mfspr r11, SPRN_SRR1 -END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) -#endif - stw r12, THREAD_NORMSAVE(1)(r10) - stw r13, THREAD_NORMSAVE(2)(r10) - mfcr r13 - stw r13, THREAD_NORMSAVE(3)(r10) - DO_KVM BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1 -START_BTB_FLUSH_SECTION - mfspr r11, SPRN_SRR1 - andi. r10,r11,MSR_PR - beq 1f - BTB_FLUSH(r10) -1: -END_BTB_FLUSH_SECTION - - mfspr r10, SPRN_SRR0 /* Get faulting address */ - - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ - lis r11, PAGE_OFFSET@h - cmplw 5, r10, r11 - blt 5, 3f - lis r11, swapper_pg_dir@h - ori r11, r11, swapper_pg_dir@l - - mfspr r12,SPRN_MAS1 /* Set TID to 0 */ - rlwinm r12,r12,0,16,1 - mtspr SPRN_MAS1,r12 - - /* Make up the required permissions for kernel code */ -#ifdef CONFIG_PTE_64BIT - li r13,_PAGE_PRESENT | _PAGE_BAP_SX - oris r13,r13,_PAGE_ACCESSED@h -#else - li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC -#endif - b 4f - - /* Get the PGD for the current thread */ -3: - mfspr r11,SPRN_SPRG_THREAD - lwz r11,PGDIR(r11) - -#ifdef CONFIG_PPC_KUAP - mfspr r12, SPRN_MAS1 - rlwinm. r12,r12,0,0x3fff0000 - beq 2f /* KUAP fault */ -#endif - - /* Make up the required permissions for user code */ -#ifdef CONFIG_PTE_64BIT - li r13,_PAGE_PRESENT | _PAGE_BAP_UX - oris r13,r13,_PAGE_ACCESSED@h -#else - li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC -#endif - -4: - FIND_PTE - andc. r13,r13,r11 /* Check permission */ - -#ifdef CONFIG_PTE_64BIT -#ifdef CONFIG_SMP - subf r13,r11,r12 /* create false data dep */ - lwzx r13,r11,r13 /* Get upper pte bits */ -#else - lwz r13,0(r12) /* Get upper pte bits */ -#endif -#endif - - bne 2f /* Bail if permission mismatch */ - - /* Jump to common TLB load point */ - b finish_tlb_load - -2: - /* The bailout. Restore registers to pre-exception conditions - * and call the heavyweights to help us out. - */ - mfspr r10, SPRN_SPRG_THREAD - lwz r11, THREAD_NORMSAVE(3)(r10) - mtcr r11 - lwz r13, THREAD_NORMSAVE(2)(r10) - lwz r12, THREAD_NORMSAVE(1)(r10) - lwz r11, THREAD_NORMSAVE(0)(r10) - mfspr r10, SPRN_SPRG_RSCRATCH0 - b InstructionStorage - -/* Define SPE handlers for e500v2 */ -#ifdef CONFIG_SPE - /* SPE Unavailable */ - START_EXCEPTION(SPEUnavailable) - NORMAL_EXCEPTION_PROLOG(0x2010, SPE_UNAVAIL) - beq 1f - bl load_up_spe - b fast_exception_return -1: prepare_transfer_to_handler - bl KernelSPE - b interrupt_return -#elif defined(CONFIG_SPE_POSSIBLE) - EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, unknown_exception) -#endif /* CONFIG_SPE_POSSIBLE */ - - /* SPE Floating Point Data */ -#ifdef CONFIG_SPE - START_EXCEPTION(SPEFloatingPointData) - NORMAL_EXCEPTION_PROLOG(0x2030, SPE_FP_DATA) - prepare_transfer_to_handler - bl SPEFloatingPointException - REST_NVGPRS(r1) - b interrupt_return - - /* SPE Floating Point Round */ - START_EXCEPTION(SPEFloatingPointRound) - NORMAL_EXCEPTION_PROLOG(0x2050, SPE_FP_ROUND) - prepare_transfer_to_handler - bl SPEFloatingPointRoundException - REST_NVGPRS(r1) - b interrupt_return -#elif defined(CONFIG_SPE_POSSIBLE) - EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, unknown_exception) - EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, unknown_exception) -#endif /* CONFIG_SPE_POSSIBLE */ - - - /* Performance Monitor */ - EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \ - performance_monitor_exception) - - EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception) - - CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \ - CriticalDoorbell, unknown_exception) - - /* Debug Interrupt */ - DEBUG_DEBUG_EXCEPTION - DEBUG_CRIT_EXCEPTION - - GUEST_DOORBELL_EXCEPTION - - CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \ - unknown_exception) - - /* Hypercall */ - EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception) - - /* Embedded Hypervisor Privilege */ - EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception) - -interrupt_end: - -/* - * Local functions - */ - -/* - * Both the instruction and data TLB miss get to this - * point to load the TLB. - * r10 - tsize encoding (if HUGETLB_PAGE) or available to use - * r11 - TLB (info from Linux PTE) - * r12 - available to use - * r13 - upper bits of PTE (if PTE_64BIT) or available to use - * CR5 - results of addr >= PAGE_OFFSET - * MAS0, MAS1 - loaded with proper value when we get here - * MAS2, MAS3 - will need additional info from Linux PTE - * Upon exit, we reload everything and RFI. - */ -finish_tlb_load: -#ifdef CONFIG_HUGETLB_PAGE - cmpwi 6, r10, 0 /* check for huge page */ - beq 6, finish_tlb_load_cont /* !huge */ - - /* Alas, we need more scratch registers for hugepages */ - mfspr r12, SPRN_SPRG_THREAD - stw r14, THREAD_NORMSAVE(4)(r12) - stw r15, THREAD_NORMSAVE(5)(r12) - stw r16, THREAD_NORMSAVE(6)(r12) - stw r17, THREAD_NORMSAVE(7)(r12) - - /* Get the next_tlbcam_idx percpu var */ -#ifdef CONFIG_SMP - lwz r15, TASK_CPU-THREAD(r12) - lis r14, __per_cpu_offset@h - ori r14, r14, __per_cpu_offset@l - rlwinm r15, r15, 2, 0, 29 - lwzx r16, r14, r15 -#else - li r16, 0 -#endif - lis r17, next_tlbcam_idx@h - ori r17, r17, next_tlbcam_idx@l - add r17, r17, r16 /* r17 = *next_tlbcam_idx */ - lwz r15, 0(r17) /* r15 = next_tlbcam_idx */ - - lis r14, MAS0_TLBSEL(1)@h /* select TLB1 (TLBCAM) */ - rlwimi r14, r15, 16, 4, 15 /* next_tlbcam_idx entry */ - mtspr SPRN_MAS0, r14 - - /* Extract TLB1CFG(NENTRY) */ - mfspr r16, SPRN_TLB1CFG - andi. r16, r16, 0xfff - - /* Update next_tlbcam_idx, wrapping when necessary */ - addi r15, r15, 1 - cmpw r15, r16 - blt 100f - lis r14, tlbcam_index@h - ori r14, r14, tlbcam_index@l - lwz r15, 0(r14) -100: stw r15, 0(r17) - - /* - * Calc MAS1_TSIZE from r10 (which has pshift encoded) - * tlb_enc = (pshift - 10). - */ - subi r15, r10, 10 - mfspr r16, SPRN_MAS1 - rlwimi r16, r15, 7, 20, 24 - mtspr SPRN_MAS1, r16 - - /* copy the pshift for use later */ - mr r14, r10 - - /* fall through */ - -#endif /* CONFIG_HUGETLB_PAGE */ - - /* - * We set execute, because we don't have the granularity to - * properly set this at the page level (Linux problem). - * Many of these bits are software only. Bits we don't set - * here we (properly should) assume have the appropriate value. - */ -finish_tlb_load_cont: -#ifdef CONFIG_PTE_64BIT - rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */ - andi. r10, r11, _PAGE_DIRTY - bne 1f - li r10, MAS3_SW | MAS3_UW - andc r12, r12, r10 -1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */ - rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */ -2: mtspr SPRN_MAS3, r12 -BEGIN_MMU_FTR_SECTION - srwi r10, r13, 12 /* grab RPN[12:31] */ - mtspr SPRN_MAS7, r10 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) -#else - li r10, (_PAGE_EXEC | _PAGE_PRESENT) - mr r13, r11 - rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ - and r12, r11, r10 - andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ - slwi r10, r12, 1 - or r10, r10, r12 - rlwinm r10, r10, 0, ~_PAGE_EXEC /* Clear SX on user pages */ - iseleq r12, r12, r10 - rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */ - mtspr SPRN_MAS3, r13 -#endif - - mfspr r12, SPRN_MAS2 -#ifdef CONFIG_PTE_64BIT - rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ -#else - rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ -#endif -#ifdef CONFIG_HUGETLB_PAGE - beq 6, 3f /* don't mask if page isn't huge */ - li r13, 1 - slw r13, r13, r14 - subi r13, r13, 1 - rlwinm r13, r13, 0, 0, 19 /* bottom bits used for WIMGE/etc */ - andc r12, r12, r13 /* mask off ea bits within the page */ -#endif -3: mtspr SPRN_MAS2, r12 - -tlb_write_entry: - tlbwe - - /* Done...restore registers and get out of here. */ - mfspr r10, SPRN_SPRG_THREAD -#ifdef CONFIG_HUGETLB_PAGE - beq 6, 8f /* skip restore for 4k page faults */ - lwz r14, THREAD_NORMSAVE(4)(r10) - lwz r15, THREAD_NORMSAVE(5)(r10) - lwz r16, THREAD_NORMSAVE(6)(r10) - lwz r17, THREAD_NORMSAVE(7)(r10) -#endif -8: lwz r11, THREAD_NORMSAVE(3)(r10) - mtcr r11 - lwz r13, THREAD_NORMSAVE(2)(r10) - lwz r12, THREAD_NORMSAVE(1)(r10) - lwz r11, THREAD_NORMSAVE(0)(r10) - mfspr r10, SPRN_SPRG_RSCRATCH0 - rfi /* Force context change */ - -#ifdef CONFIG_SPE -/* Note that the SPE support is closely modeled after the AltiVec - * support. Changes to one are likely to be applicable to the - * other! */ -_GLOBAL(load_up_spe) -/* - * Disable SPE for the task which had SPE previously, - * and save its SPE registers in its thread_struct. - * Enables SPE for use in the kernel on return. - * On SMP we know the SPE units are free, since we give it up every - * switch. -- Kumar - */ - mfmsr r5 - oris r5,r5,MSR_SPE@h - mtmsr r5 /* enable use of SPE now */ - isync - /* enable use of SPE after return */ - oris r9,r9,MSR_SPE@h - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - li r4,1 - li r10,THREAD_ACC - stw r4,THREAD_USED_SPE(r5) - evlddx evr4,r10,r5 - evmra evr4,evr4 - REST_32EVRS(0,r10,r5,THREAD_EVR0) - blr - -/* - * SPE unavailable trap from kernel - print a message, but let - * the task use SPE in the kernel until it returns to user mode. - */ -KernelSPE: - lwz r3,_MSR(r1) - oris r3,r3,MSR_SPE@h - stw r3,_MSR(r1) /* enable use of SPE after return */ -#ifdef CONFIG_PRINTK - lis r3,87f@h - ori r3,r3,87f@l - mr r4,r2 /* current */ - lwz r5,_NIP(r1) - bl _printk -#endif - b interrupt_return -#ifdef CONFIG_PRINTK -87: .string "SPE used in kernel (task=%p, pc=%x) \n" -#endif - .align 4,0 - -#endif /* CONFIG_SPE */ - -/* - * Translate the effec addr in r3 to phys addr. The phys addr will be put - * into r3(higher 32bit) and r4(lower 32bit) - */ -get_phys_addr: - mfmsr r8 - mfspr r9,SPRN_PID - rlwinm r9,r9,16,0x3fff0000 /* turn PID into MAS6[SPID] */ - rlwimi r9,r8,28,0x00000001 /* turn MSR[DS] into MAS6[SAS] */ - mtspr SPRN_MAS6,r9 - - tlbsx 0,r3 /* must succeed */ - - mfspr r8,SPRN_MAS1 - mfspr r12,SPRN_MAS3 - rlwinm r9,r8,25,0x1f /* r9 = log2(page size) */ - li r10,1024 - slw r10,r10,r9 /* r10 = page size */ - addi r10,r10,-1 - and r11,r3,r10 /* r11 = page offset */ - andc r4,r12,r10 /* r4 = page base */ - or r4,r4,r11 /* r4 = devtree phys addr */ -#ifdef CONFIG_PHYS_64BIT - mfspr r3,SPRN_MAS7 -#endif - blr - -/* - * Global functions - */ - -#ifdef CONFIG_E500 -#ifndef CONFIG_PPC_E500MC -/* Adjust or setup IVORs for e500v1/v2 */ -_GLOBAL(__setup_e500_ivors) - li r3,DebugCrit@l - mtspr SPRN_IVOR15,r3 - li r3,SPEUnavailable@l - mtspr SPRN_IVOR32,r3 - li r3,SPEFloatingPointData@l - mtspr SPRN_IVOR33,r3 - li r3,SPEFloatingPointRound@l - mtspr SPRN_IVOR34,r3 - li r3,PerformanceMonitor@l - mtspr SPRN_IVOR35,r3 - sync - blr -#else -/* Adjust or setup IVORs for e500mc */ -_GLOBAL(__setup_e500mc_ivors) - li r3,DebugDebug@l - mtspr SPRN_IVOR15,r3 - li r3,PerformanceMonitor@l - mtspr SPRN_IVOR35,r3 - li r3,Doorbell@l - mtspr SPRN_IVOR36,r3 - li r3,CriticalDoorbell@l - mtspr SPRN_IVOR37,r3 - sync - blr - -/* setup ehv ivors for */ -_GLOBAL(__setup_ehv_ivors) - li r3,GuestDoorbell@l - mtspr SPRN_IVOR38,r3 - li r3,CriticalGuestDoorbell@l - mtspr SPRN_IVOR39,r3 - li r3,Hypercall@l - mtspr SPRN_IVOR40,r3 - li r3,Ehvpriv@l - mtspr SPRN_IVOR41,r3 - sync - blr -#endif /* CONFIG_PPC_E500MC */ -#endif /* CONFIG_E500 */ - -#ifdef CONFIG_SPE -/* - * extern void __giveup_spe(struct task_struct *prev) - * - */ -_GLOBAL(__giveup_spe) - addi r3,r3,THREAD /* want THREAD of task */ - lwz r5,PT_REGS(r3) - cmpi 0,r5,0 - SAVE_32EVRS(0, r4, r3, THREAD_EVR0) - evxor evr6, evr6, evr6 /* clear out evr6 */ - evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */ - li r4,THREAD_ACC - evstddx evr6, r4, r3 /* save off accumulator */ - beq 1f - lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r3,MSR_SPE@h - andc r4,r4,r3 /* disable SPE for previous task */ - stw r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: - blr -#endif /* CONFIG_SPE */ - -/* - * extern void abort(void) - * - * At present, this routine just applies a system reset. - */ -_GLOBAL(abort) - li r13,0 - mtspr SPRN_DBCR0,r13 /* disable all debug events */ - isync - mfmsr r13 - ori r13,r13,MSR_DE@l /* Enable Debug Events */ - mtmsr r13 - isync - mfspr r13,SPRN_DBCR0 - lis r13,(DBCR0_IDM|DBCR0_RST_CHIP)@h - mtspr SPRN_DBCR0,r13 - isync - -#ifdef CONFIG_SMP -/* When we get here, r24 needs to hold the CPU # */ - .globl __secondary_start -__secondary_start: - LOAD_REG_ADDR_PIC(r3, tlbcam_index) - lwz r3,0(r3) - mtctr r3 - li r26,0 /* r26 safe? */ - - bl switch_to_as1 - mr r27,r3 /* tlb entry */ - /* Load each CAM entry */ -1: mr r3,r26 - bl loadcam_entry - addi r26,r26,1 - bdnz 1b - mr r3,r27 /* tlb entry */ - LOAD_REG_ADDR_PIC(r4, memstart_addr) - lwz r4,0(r4) - mr r5,r25 /* phys kernel start */ - rlwinm r5,r5,0,~0x3ffffff /* aligned 64M */ - subf r4,r5,r4 /* memstart_addr - phys kernel start */ - lis r7,KERNELBASE@h - ori r7,r7,KERNELBASE@l - cmpw r20,r7 /* if kernstart_virt_addr != KERNELBASE, randomized */ - beq 2f - li r4,0 -2: li r5,0 /* no device tree */ - li r6,0 /* not boot cpu */ - bl restore_to_as0 - - - lis r3,__secondary_hold_acknowledge@h - ori r3,r3,__secondary_hold_acknowledge@l - stw r24,0(r3) - - li r3,0 - mr r4,r24 /* Why? */ - bl call_setup_cpu - - /* get current's stack and current */ - lis r2,secondary_current@ha - lwz r2,secondary_current@l(r2) - lwz r1,TASK_STACK(r2) - - /* stack */ - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD - li r0,0 - stw r0,0(r1) - - /* ptr to current thread */ - addi r4,r2,THREAD /* address of our thread_struct */ - mtspr SPRN_SPRG_THREAD,r4 - - /* Setup the defaults for TLB entries */ - li r4,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l - mtspr SPRN_MAS4,r4 - - /* Jump to start_secondary */ - lis r4,MSR_KERNEL@h - ori r4,r4,MSR_KERNEL@l - lis r3,start_secondary@h - ori r3,r3,start_secondary@l - mtspr SPRN_SRR0,r3 - mtspr SPRN_SRR1,r4 - sync - rfi - sync - - .globl __secondary_hold_acknowledge -__secondary_hold_acknowledge: - .long -1 -#endif - -/* - * Create a 64M tlb by address and entry - * r3 - entry - * r4 - virtual address - * r5/r6 - physical address - */ -_GLOBAL(create_kaslr_tlb_entry) - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ - mtspr SPRN_MAS0,r7 /* Write MAS0 */ - - lis r3,(MAS1_VALID|MAS1_IPROT)@h - ori r3,r3,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l - mtspr SPRN_MAS1,r3 /* Write MAS1 */ - - lis r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h - ori r3,r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l - and r3,r3,r4 - ori r3,r3,MAS2_M_IF_NEEDED@l - mtspr SPRN_MAS2,r3 /* Write MAS2(EPN) */ - -#ifdef CONFIG_PHYS_64BIT - ori r8,r6,(MAS3_SW|MAS3_SR|MAS3_SX) - mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ - mtspr SPRN_MAS7,r5 -#else - ori r8,r5,(MAS3_SW|MAS3_SR|MAS3_SX) - mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ -#endif - - tlbwe /* Write TLB */ - isync - sync - blr - -/* - * Return to the start of the relocated kernel and run again - * r3 - virtual address of fdt - * r4 - entry of the kernel - */ -_GLOBAL(reloc_kernel_entry) - mfmsr r7 - rlwinm r7, r7, 0, ~(MSR_IS | MSR_DS) - - mtspr SPRN_SRR0,r4 - mtspr SPRN_SRR1,r7 - rfi - -/* - * Create a tlb entry with the same effective and physical address as - * the tlb entry used by the current running code. But set the TS to 1. - * Then switch to the address space 1. It will return with the r3 set to - * the ESEL of the new created tlb. - */ -_GLOBAL(switch_to_as1) - mflr r5 - - /* Find a entry not used */ - mfspr r3,SPRN_TLB1CFG - andi. r3,r3,0xfff - mfspr r4,SPRN_PID - rlwinm r4,r4,16,0x3fff0000 /* turn PID into MAS6[SPID] */ - mtspr SPRN_MAS6,r4 -1: lis r4,0x1000 /* Set MAS0(TLBSEL) = 1 */ - addi r3,r3,-1 - rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ - mtspr SPRN_MAS0,r4 - tlbre - mfspr r4,SPRN_MAS1 - andis. r4,r4,MAS1_VALID@h - bne 1b - - /* Get the tlb entry used by the current running code */ - bcl 20,31,$+4 -0: mflr r4 - tlbsx 0,r4 - - mfspr r4,SPRN_MAS1 - ori r4,r4,MAS1_TS /* Set the TS = 1 */ - mtspr SPRN_MAS1,r4 - - mfspr r4,SPRN_MAS0 - rlwinm r4,r4,0,~MAS0_ESEL_MASK - rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ - mtspr SPRN_MAS0,r4 - tlbwe - isync - sync - - mfmsr r4 - ori r4,r4,MSR_IS | MSR_DS - mtspr SPRN_SRR0,r5 - mtspr SPRN_SRR1,r4 - sync - rfi - -/* - * Restore to the address space 0 and also invalidate the tlb entry created - * by switch_to_as1. - * r3 - the tlb entry which should be invalidated - * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0) - * r5 - device tree virtual address. If r4 is 0, r5 is ignored. - * r6 - boot cpu -*/ -_GLOBAL(restore_to_as0) - mflr r0 - - bcl 20,31,$+4 -0: mflr r9 - addi r9,r9,1f - 0b - - /* - * We may map the PAGE_OFFSET in AS0 to a different physical address, - * so we need calculate the right jump and device tree address based - * on the offset passed by r4. - */ - add r9,r9,r4 - add r5,r5,r4 - add r0,r0,r4 - -2: mfmsr r7 - li r8,(MSR_IS | MSR_DS) - andc r7,r7,r8 - - mtspr SPRN_SRR0,r9 - mtspr SPRN_SRR1,r7 - sync - rfi - - /* Invalidate the temporary tlb entry for AS1 */ -1: lis r9,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r9,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ - mtspr SPRN_MAS0,r9 - tlbre - mfspr r9,SPRN_MAS1 - rlwinm r9,r9,0,2,31 /* Clear MAS1 Valid and IPPROT */ - mtspr SPRN_MAS1,r9 - tlbwe - isync - - cmpwi r4,0 - cmpwi cr1,r6,0 - cror eq,4*cr1+eq,eq - bne 3f /* offset != 0 && is_boot_cpu */ - mtlr r0 - blr - - /* - * The PAGE_OFFSET will map to a different physical address, - * jump to _start to do another relocation again. - */ -3: mr r3,r5 - bl _start diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index a20deebf233f..1a1e9995dae3 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -47,7 +47,7 @@ static struct hard_trap_info { 0x0c00, 0x14 /* SIGCHLD */ }, /* system call */ #ifdef CONFIG_BOOKE_OR_40x { 0x2002, 0x05 /* SIGTRAP */ }, /* debug */ -#if defined(CONFIG_FSL_BOOKE) +#if defined(CONFIG_PPC_85xx) { 0x2010, 0x08 /* SIGFPE */ }, /* spe unavailable */ { 0x2020, 0x08 /* SIGFPE */ }, /* spe unavailable */ { 0x2030, 0x08 /* SIGFPE */ }, /* spe fp data */ @@ -57,7 +57,7 @@ static struct hard_trap_info { 0x2900, 0x08 /* SIGFPE */ }, /* apu unavailable */ { 0x3100, 0x0e /* SIGALRM */ }, /* fixed interval timer */ { 0x3200, 0x02 /* SIGINT */ }, /* watchdog */ -#else /* ! CONFIG_FSL_BOOKE */ +#else /* ! CONFIG_PPC_85xx */ { 0x1000, 0x0e /* SIGALRM */ }, /* prog interval timer */ { 0x1010, 0x0e /* SIGALRM */ }, /* fixed interval timer */ { 0x1020, 0x02 /* SIGINT */ }, /* watchdog */ @@ -208,7 +208,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) for (reg = 14; reg < 32; reg++) PACK64(ptr, regs->gpr[reg]); -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx #ifdef CONFIG_SPE for (reg = 0; reg < 32; reg++) PACK64(ptr, p->thread.evr[reg]); @@ -234,7 +234,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) #define GDB_SIZEOF_REG sizeof(unsigned long) #define GDB_SIZEOF_REG_U32 sizeof(u32) -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx #define GDB_SIZEOF_FLOAT_REG sizeof(unsigned long) #else #define GDB_SIZEOF_FLOAT_REG sizeof(u64) @@ -329,7 +329,7 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) if (regno >= 32 && regno < 64) { /* FP registers 32 -> 63 */ -#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE) +#if defined(CONFIG_PPC_85xx) && defined(CONFIG_SPE) if (current) memcpy(mem, ¤t->thread.evr[regno-32], dbg_reg_def[regno].size); @@ -355,7 +355,7 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) if (regno >= 32 && regno < 64) { /* FP registers 32 -> 63 */ -#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE) +#if defined(CONFIG_PPC_85xx) && defined(CONFIG_SPE) memcpy(¤t->thread.evr[regno-32], mem, dbg_reg_def[regno].size); #else diff --git a/arch/powerpc/kernel/swsusp_85xx.S b/arch/powerpc/kernel/swsusp_85xx.S new file mode 100644 index 000000000000..88cfdbd530f1 --- /dev/null +++ b/arch/powerpc/kernel/swsusp_85xx.S @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Based on swsusp_32.S, modified for FSL BookE by + * Anton Vorontsov + * Copyright (c) 2009-2010 MontaVista Software, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Structure for storing CPU registers on the save area. + */ +#define SL_SP 0 +#define SL_PC 4 +#define SL_MSR 8 +#define SL_TCR 0xc +#define SL_SPRG0 0x10 +#define SL_SPRG1 0x14 +#define SL_SPRG2 0x18 +#define SL_SPRG3 0x1c +#define SL_SPRG4 0x20 +#define SL_SPRG5 0x24 +#define SL_SPRG6 0x28 +#define SL_SPRG7 0x2c +#define SL_TBU 0x30 +#define SL_TBL 0x34 +#define SL_R2 0x38 +#define SL_CR 0x3c +#define SL_LR 0x40 +#define SL_R12 0x44 /* r12 to r31 */ +#define SL_SIZE (SL_R12 + 80) + + .section .data + .align 5 + +_GLOBAL(swsusp_save_area) + .space SL_SIZE + + + .section .text + .align 5 + +_GLOBAL(swsusp_arch_suspend) + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + + mflr r0 + stw r0,SL_LR(r11) + mfcr r0 + stw r0,SL_CR(r11) + stw r1,SL_SP(r11) + stw r2,SL_R2(r11) + stmw r12,SL_R12(r11) + + /* Save MSR & TCR */ + mfmsr r4 + stw r4,SL_MSR(r11) + mfspr r4,SPRN_TCR + stw r4,SL_TCR(r11) + + /* Get a stable timebase and save it */ +1: mfspr r4,SPRN_TBRU + stw r4,SL_TBU(r11) + mfspr r5,SPRN_TBRL + stw r5,SL_TBL(r11) + mfspr r3,SPRN_TBRU + cmpw r3,r4 + bne 1b + + /* Save SPRGs */ + mfspr r4,SPRN_SPRG0 + stw r4,SL_SPRG0(r11) + mfspr r4,SPRN_SPRG1 + stw r4,SL_SPRG1(r11) + mfspr r4,SPRN_SPRG2 + stw r4,SL_SPRG2(r11) + mfspr r4,SPRN_SPRG3 + stw r4,SL_SPRG3(r11) + mfspr r4,SPRN_SPRG4 + stw r4,SL_SPRG4(r11) + mfspr r4,SPRN_SPRG5 + stw r4,SL_SPRG5(r11) + mfspr r4,SPRN_SPRG6 + stw r4,SL_SPRG6(r11) + mfspr r4,SPRN_SPRG7 + stw r4,SL_SPRG7(r11) + + /* Call the low level suspend stuff (we should probably have made + * a stackframe... + */ + bl swsusp_save + + /* Restore LR from the save area */ + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + lwz r0,SL_LR(r11) + mtlr r0 + + blr + +_GLOBAL(swsusp_arch_resume) + sync + + /* Load ptr the list of pages to copy in r3 */ + lis r11,(restore_pblist)@h + ori r11,r11,restore_pblist@l + lwz r3,0(r11) + + /* Copy the pages. This is a very basic implementation, to + * be replaced by something more cache efficient */ +1: + li r0,256 + mtctr r0 + lwz r5,pbe_address(r3) /* source */ + lwz r6,pbe_orig_address(r3) /* destination */ +2: + lwz r8,0(r5) + lwz r9,4(r5) + lwz r10,8(r5) + lwz r11,12(r5) + addi r5,r5,16 + stw r8,0(r6) + stw r9,4(r6) + stw r10,8(r6) + stw r11,12(r6) + addi r6,r6,16 + bdnz 2b + lwz r3,pbe_next(r3) + cmpwi 0,r3,0 + bne 1b + + bl flush_dcache_L1 + bl flush_instruction_cache + + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + + /* + * Mappings from virtual addresses to physical addresses may be + * different than they were prior to restoring hibernation state. + * Invalidate the TLB so that the boot CPU is using the new + * mappings. + */ + bl _tlbil_all + + lwz r4,SL_SPRG0(r11) + mtspr SPRN_SPRG0,r4 + lwz r4,SL_SPRG1(r11) + mtspr SPRN_SPRG1,r4 + lwz r4,SL_SPRG2(r11) + mtspr SPRN_SPRG2,r4 + lwz r4,SL_SPRG3(r11) + mtspr SPRN_SPRG3,r4 + lwz r4,SL_SPRG4(r11) + mtspr SPRN_SPRG4,r4 + lwz r4,SL_SPRG5(r11) + mtspr SPRN_SPRG5,r4 + lwz r4,SL_SPRG6(r11) + mtspr SPRN_SPRG6,r4 + lwz r4,SL_SPRG7(r11) + mtspr SPRN_SPRG7,r4 + + /* restore the MSR */ + lwz r3,SL_MSR(r11) + mtmsr r3 + + /* Restore TB */ + li r3,0 + mtspr SPRN_TBWL,r3 + lwz r3,SL_TBU(r11) + lwz r4,SL_TBL(r11) + mtspr SPRN_TBWU,r3 + mtspr SPRN_TBWL,r4 + + /* Restore TCR and clear any pending bits in TSR. */ + lwz r4,SL_TCR(r11) + mtspr SPRN_TCR,r4 + lis r4, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h + mtspr SPRN_TSR,r4 + + /* Kick decrementer */ + li r0,1 + mtdec r0 + + /* Restore the callee-saved registers and return */ + lwz r0,SL_CR(r11) + mtcr r0 + lwz r2,SL_R2(r11) + lmw r12,SL_R12(r11) + lwz r1,SL_SP(r11) + lwz r0,SL_LR(r11) + mtlr r0 + + li r3,0 + blr diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_booke.S deleted file mode 100644 index 88cfdbd530f1..000000000000 --- a/arch/powerpc/kernel/swsusp_booke.S +++ /dev/null @@ -1,202 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Based on swsusp_32.S, modified for FSL BookE by - * Anton Vorontsov - * Copyright (c) 2009-2010 MontaVista Software, LLC. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Structure for storing CPU registers on the save area. - */ -#define SL_SP 0 -#define SL_PC 4 -#define SL_MSR 8 -#define SL_TCR 0xc -#define SL_SPRG0 0x10 -#define SL_SPRG1 0x14 -#define SL_SPRG2 0x18 -#define SL_SPRG3 0x1c -#define SL_SPRG4 0x20 -#define SL_SPRG5 0x24 -#define SL_SPRG6 0x28 -#define SL_SPRG7 0x2c -#define SL_TBU 0x30 -#define SL_TBL 0x34 -#define SL_R2 0x38 -#define SL_CR 0x3c -#define SL_LR 0x40 -#define SL_R12 0x44 /* r12 to r31 */ -#define SL_SIZE (SL_R12 + 80) - - .section .data - .align 5 - -_GLOBAL(swsusp_save_area) - .space SL_SIZE - - - .section .text - .align 5 - -_GLOBAL(swsusp_arch_suspend) - lis r11,swsusp_save_area@h - ori r11,r11,swsusp_save_area@l - - mflr r0 - stw r0,SL_LR(r11) - mfcr r0 - stw r0,SL_CR(r11) - stw r1,SL_SP(r11) - stw r2,SL_R2(r11) - stmw r12,SL_R12(r11) - - /* Save MSR & TCR */ - mfmsr r4 - stw r4,SL_MSR(r11) - mfspr r4,SPRN_TCR - stw r4,SL_TCR(r11) - - /* Get a stable timebase and save it */ -1: mfspr r4,SPRN_TBRU - stw r4,SL_TBU(r11) - mfspr r5,SPRN_TBRL - stw r5,SL_TBL(r11) - mfspr r3,SPRN_TBRU - cmpw r3,r4 - bne 1b - - /* Save SPRGs */ - mfspr r4,SPRN_SPRG0 - stw r4,SL_SPRG0(r11) - mfspr r4,SPRN_SPRG1 - stw r4,SL_SPRG1(r11) - mfspr r4,SPRN_SPRG2 - stw r4,SL_SPRG2(r11) - mfspr r4,SPRN_SPRG3 - stw r4,SL_SPRG3(r11) - mfspr r4,SPRN_SPRG4 - stw r4,SL_SPRG4(r11) - mfspr r4,SPRN_SPRG5 - stw r4,SL_SPRG5(r11) - mfspr r4,SPRN_SPRG6 - stw r4,SL_SPRG6(r11) - mfspr r4,SPRN_SPRG7 - stw r4,SL_SPRG7(r11) - - /* Call the low level suspend stuff (we should probably have made - * a stackframe... - */ - bl swsusp_save - - /* Restore LR from the save area */ - lis r11,swsusp_save_area@h - ori r11,r11,swsusp_save_area@l - lwz r0,SL_LR(r11) - mtlr r0 - - blr - -_GLOBAL(swsusp_arch_resume) - sync - - /* Load ptr the list of pages to copy in r3 */ - lis r11,(restore_pblist)@h - ori r11,r11,restore_pblist@l - lwz r3,0(r11) - - /* Copy the pages. This is a very basic implementation, to - * be replaced by something more cache efficient */ -1: - li r0,256 - mtctr r0 - lwz r5,pbe_address(r3) /* source */ - lwz r6,pbe_orig_address(r3) /* destination */ -2: - lwz r8,0(r5) - lwz r9,4(r5) - lwz r10,8(r5) - lwz r11,12(r5) - addi r5,r5,16 - stw r8,0(r6) - stw r9,4(r6) - stw r10,8(r6) - stw r11,12(r6) - addi r6,r6,16 - bdnz 2b - lwz r3,pbe_next(r3) - cmpwi 0,r3,0 - bne 1b - - bl flush_dcache_L1 - bl flush_instruction_cache - - lis r11,swsusp_save_area@h - ori r11,r11,swsusp_save_area@l - - /* - * Mappings from virtual addresses to physical addresses may be - * different than they were prior to restoring hibernation state. - * Invalidate the TLB so that the boot CPU is using the new - * mappings. - */ - bl _tlbil_all - - lwz r4,SL_SPRG0(r11) - mtspr SPRN_SPRG0,r4 - lwz r4,SL_SPRG1(r11) - mtspr SPRN_SPRG1,r4 - lwz r4,SL_SPRG2(r11) - mtspr SPRN_SPRG2,r4 - lwz r4,SL_SPRG3(r11) - mtspr SPRN_SPRG3,r4 - lwz r4,SL_SPRG4(r11) - mtspr SPRN_SPRG4,r4 - lwz r4,SL_SPRG5(r11) - mtspr SPRN_SPRG5,r4 - lwz r4,SL_SPRG6(r11) - mtspr SPRN_SPRG6,r4 - lwz r4,SL_SPRG7(r11) - mtspr SPRN_SPRG7,r4 - - /* restore the MSR */ - lwz r3,SL_MSR(r11) - mtmsr r3 - - /* Restore TB */ - li r3,0 - mtspr SPRN_TBWL,r3 - lwz r3,SL_TBU(r11) - lwz r4,SL_TBL(r11) - mtspr SPRN_TBWU,r3 - mtspr SPRN_TBWL,r4 - - /* Restore TCR and clear any pending bits in TSR. */ - lwz r4,SL_TCR(r11) - mtspr SPRN_TCR,r4 - lis r4, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h - mtspr SPRN_TSR,r4 - - /* Kick decrementer */ - li r0,1 - mtdec r0 - - /* Restore the callee-saved registers and return */ - lwz r0,SL_CR(r11) - mtcr r0 - lwz r2,SL_R2(r11) - lmw r12,SL_R12(r11) - lwz r1,SL_SP(r11) - lwz r0,SL_LR(r11) - mtlr r0 - - li r3,0 - blr diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dcf4046f8565..f181c434289e 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -2085,7 +2085,7 @@ DEFINE_INTERRUPT_HANDLER(altivec_assist_exception) } #endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx DEFINE_INTERRUPT_HANDLER(CacheLockingException) { unsigned long error_code = regs->dsisr; @@ -2098,7 +2098,7 @@ DEFINE_INTERRUPT_HANDLER(CacheLockingException) _exception(SIGILL, regs, ILL_PRVOPC, regs->nip); return; } -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ #ifdef CONFIG_SPE DEFINE_INTERRUPT_HANDLER(SPEFloatingPointException) diff --git a/arch/powerpc/kexec/core_32.c b/arch/powerpc/kexec/core_32.c index b50aed48d09d..c95f96850c9e 100644 --- a/arch/powerpc/kexec/core_32.c +++ b/arch/powerpc/kexec/core_32.c @@ -55,7 +55,7 @@ void default_machine_kexec(struct kimage *image) reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE); printk(KERN_INFO "Bye!\n"); - if (!IS_ENABLED(CONFIG_FSL_BOOKE) && !IS_ENABLED(CONFIG_44x)) + if (!IS_ENABLED(CONFIG_PPC_85xx) && !IS_ENABLED(CONFIG_44x)) relocate_new_kernel(page_list, reboot_code_buffer_phys, image->start); /* now call it */ diff --git a/arch/powerpc/kexec/relocate_32.S b/arch/powerpc/kexec/relocate_32.S index cf6e52bdf8d8..d9f0dd9b34ff 100644 --- a/arch/powerpc/kexec/relocate_32.S +++ b/arch/powerpc/kexec/relocate_32.S @@ -25,14 +25,14 @@ relocate_new_kernel: /* r4 = reboot_code_buffer */ /* r5 = start_address */ -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx mr r29, r3 mr r30, r4 mr r31, r5 #define ENTRY_MAPPING_KEXEC_SETUP -#include +#include #undef ENTRY_MAPPING_KEXEC_SETUP mr r3, r29 diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 6fa82efe833b..205545d820a1 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -223,7 +223,7 @@ _GLOBAL(kvmppc_resume_host) lwz r3, VCPU_HOST_PID(r4) mtspr SPRN_PID, r3 -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* we cheat and know that Linux doesn't use PID1 which is always 0 */ lis r3, 0 mtspr SPRN_PID1, r3 @@ -406,7 +406,7 @@ lightweight_exit: lwz r3, VCPU_SHADOW_PID(r4) mtspr SPRN_PID, r3 -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx lwz r3, VCPU_SHADOW_PID1(r4) mtspr SPRN_PID1, r3 #endif diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3142d7617412..d4cc3749e621 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -85,12 +85,12 @@ void __init MMU_init(void) total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr; lowmem_end_addr = memstart_addr + total_lowmem; -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB * entries, so we need to adjust lowmem to match the amount we can map * in the fixed entries */ adjust_total_lowmem(); -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ if (total_lowmem > __max_low_memory) { total_lowmem = __max_low_memory; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 0e3528aec49e..88805757d0c9 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -146,9 +146,9 @@ struct tlbcam { extern struct tlbcam TLBCAM[NUM_TLBCAMS]; #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_8xx) /* 6xx have BATS */ -/* FSL_BOOKE have TLBCAM */ +/* PPC_85xx have TLBCAM */ /* 8xx have LTLB */ phys_addr_t v_block_mapped(unsigned long va); unsigned long p_block_mapped(phys_addr_t pa); diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c index c1ad173de318..40a4e69ae1a9 100644 --- a/arch/powerpc/mm/nohash/fsl_book3e.c +++ b/arch/powerpc/mm/nohash/fsl_book3e.c @@ -59,7 +59,7 @@ static struct { phys_addr_t phys; } tlbcam_addrs[NUM_TLBCAMS]; -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* * Return PA for this VA if it is mapped by a CAM, or 0 */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 5e7ccb48b79c..f21896ebdc5a 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -130,7 +130,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .enc = BOOK3E_PAGESZ_1GB, }, }; -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ static inline int mmu_get_tsize(int psize) { diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index d62b613a0d5d..d378031246ab 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -221,7 +221,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) blr #endif /* CONFIG_PPC_47x */ -#elif defined(CONFIG_FSL_BOOKE) +#elif defined(CONFIG_PPC_85xx) /* * FSL BookE implementations. * diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 19fd95a06352..11780074eb23 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -324,11 +324,6 @@ config BOOKE_OR_40x depends on BOOKE || 40x default y -config FSL_BOOKE - bool - depends on E500 && PPC32 - default y - # this is for common code between PPC32 & PPC64 FSL BOOKE config PPC_FSL_BOOK3E bool @@ -337,7 +332,7 @@ config PPC_FSL_BOOK3E select PPC_SMP_MUXED_IPI select PPC_DOORBELL select PPC_KUEP - default y if FSL_BOOKE + default y if PPC_85xx config PTE_64BIT bool @@ -485,7 +480,7 @@ config PPC_MMU_NOHASH config PPC_BOOK3E_MMU def_bool y - depends on FSL_BOOKE || PPC_BOOK3E + depends on PPC_85xx || PPC_BOOK3E config PPC_HAVE_PMU_SUPPORT bool @@ -508,7 +503,7 @@ config FORCE_SMP select SMP config SMP - depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x + depends on PPC_BOOK3S || PPC_BOOK3E || PPC_85xx || PPC_47x select GENERIC_IRQ_MIGRATION bool "Symmetric multi-processing support" if !FORCE_SMP help -- cgit v1.2.3 From e0d68273d7069537701bb91c51d90d1e12aacc33 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Sep 2022 19:01:33 +0200 Subject: powerpc: Remove CONFIG_PPC_BOOK3E CONFIG_PPC_BOOK3E is redundant with CONFIG_PPC_BOOK3E_64. The later is more explicit about the fact that it's a 64 bits target. Remove CONFIG_PPC_BOOK3E. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5d0891490813c19cdcfc04678f512ea68cba3e64.1663606876.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/cputable.h | 4 ++-- arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/nohash/pgalloc.h | 2 +- arch/powerpc/include/asm/paca.h | 8 +++---- arch/powerpc/include/asm/ppc_asm.h | 4 ++-- arch/powerpc/kernel/asm-offsets.c | 6 ++--- arch/powerpc/kernel/entry_64.S | 6 ++--- arch/powerpc/kernel/head_64.S | 40 +++++++++++++++---------------- arch/powerpc/kernel/misc_64.S | 6 ++--- arch/powerpc/kernel/paca.c | 6 ++--- arch/powerpc/kernel/setup.h | 2 +- arch/powerpc/kernel/setup_64.c | 8 +++---- arch/powerpc/kernel/vmlinux.lds.S | 2 +- arch/powerpc/kexec/core_64.c | 2 +- arch/powerpc/mm/mmu_decl.h | 6 ++--- arch/powerpc/mm/nohash/tlb_low.S | 2 +- arch/powerpc/platforms/85xx/Kconfig | 2 +- arch/powerpc/platforms/Kconfig.cputype | 10 +++----- arch/powerpc/xmon/xmon.c | 16 ++++++------- 20 files changed, 66 insertions(+), 70 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index dafb14f44672..9d721cac20b8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -548,7 +548,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config KEXEC bool "kexec system call" - depends on (PPC_BOOK3S || PPC_85xx || (44x && !SMP)) || PPC_BOOK3E + depends on (PPC_BOOK3S || PPC_85xx || (44x && !SMP)) || PPC_BOOK3E_64 select KEXEC_CORE help kexec is a system call that implements the ability to shutdown your diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index ae8c3e13cfce..27875f0b7bc7 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -463,7 +463,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTRS_COMPATIBLE (CPU_FTR_PPCAS_ARCH_V2) #ifdef CONFIG_PPC64 -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 #define CPU_FTRS_POSSIBLE (CPU_FTRS_E6500 | CPU_FTRS_E5500) #else #ifdef CONFIG_CPU_LITTLE_ENDIAN @@ -521,7 +521,7 @@ enum { #endif /* __powerpc64__ */ #ifdef CONFIG_PPC64 -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 #define CPU_FTRS_ALWAYS (CPU_FTRS_E6500 & CPU_FTRS_E5500) #else diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 8069dbc4b8d1..84a1cdc3204c 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -281,7 +281,7 @@ static inline bool nmi_disables_ftrace(struct pt_regs *regs) if (TRAP(regs) == INTERRUPT_PERFMON) return false; } - if (IS_ENABLED(CONFIG_PPC_BOOK3E)) { + if (IS_ENABLED(CONFIG_PPC_BOOK3E_64)) { if (TRAP(regs) == INTERRUPT_PERFMON) return false; } diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h index 29c43665a753..4b62376318e1 100644 --- a/arch/powerpc/include/asm/nohash/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -15,7 +15,7 @@ static inline void tlb_flush_pgtable(struct mmu_gather *tlb, { } -#endif /* !CONFIG_PPC_BOOK3E */ +#endif /* !CONFIG_PPC_BOOK3E_64 */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 3537b0500f4d..09f1790d0ae1 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -18,7 +18,7 @@ #include #include #include -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 #include #else #include @@ -127,7 +127,7 @@ struct paca_struct { #endif #endif /* CONFIG_PPC_BOOK3S_64 */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 u64 exgen[8] __aligned(0x40); /* Keep pgd in the same cacheline as the start of extlb */ pgd_t *pgd __aligned(0x40); /* Current PGD */ @@ -151,7 +151,7 @@ struct paca_struct { void *dbg_kstack; struct tlb_core_data tcd; -#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_BOOK3E_64 */ #ifdef CONFIG_PPC_64S_HASH_MMU unsigned char mm_ctx_low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; @@ -168,7 +168,7 @@ struct paca_struct { #ifdef CONFIG_PPC64 u64 exit_save_r1; /* Syscall/interrupt R1 save */ #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 u16 trap_save; /* Used when bad stack is encountered */ #endif #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 83c02f5a7f2a..55149a0384db 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -709,7 +709,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) * kernel is built for. */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 #define FIXUP_ENDIAN #else /* @@ -749,7 +749,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) .long 0x2402004c; /* hrfid */ \ 191: -#endif /* !CONFIG_PPC_BOOK3E */ +#endif /* !CONFIG_PPC_BOOK3E_64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8c10f536e478..10ce03052a19 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -197,7 +197,7 @@ int main(void) OFFSET(PACAIRQHAPPENED, paca_struct, irq_happened); OFFSET(PACA_FTRACE_ENABLED, paca_struct, ftrace_enabled); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 OFFSET(PACAPGD, paca_struct, pgd); OFFSET(PACA_KERNELPGD, paca_struct, kernel_pgd); OFFSET(PACA_EXGEN, paca_struct, exgen); @@ -213,7 +213,7 @@ int main(void) OFFSET(TCD_ESEL_NEXT, tlb_core_data, esel_next); OFFSET(TCD_ESEL_MAX, tlb_core_data, esel_max); OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first); -#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_BOOK3E_64 */ #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACA_EXGEN, paca_struct, exgen); @@ -248,7 +248,7 @@ int main(void) #ifdef CONFIG_PPC64 OFFSET(PACA_EXIT_SAVE_R1, paca_struct, exit_save_r1); #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 OFFSET(PACA_TRAP_SAVE, paca_struct, trap_save); #endif OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 01ace4c56104..3e2e37e6ecab 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -292,16 +292,16 @@ _GLOBAL(enter_prom) /* Prepare a 32-bit mode big endian MSR */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 rlwinm r11,r11,0,1,31 mtsrr1 r11 rfi -#else /* CONFIG_PPC_BOOK3E */ +#else /* CONFIG_PPC_BOOK3E_64 */ LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_LE) andc r11,r11,r12 mtsrr1 r11 RFI_TO_KERNEL -#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_BOOK3E_64 */ 1: /* Return from OF */ FIXUP_ENDIAN diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index cf2c08902c05..da544ea0ce01 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -143,7 +143,7 @@ DEFINE_FIXED_SYMBOL(__run_at_load, first_256B) .globl __secondary_hold __secondary_hold: FIXUP_ENDIAN -#ifndef CONFIG_PPC_BOOK3E +#ifndef CONFIG_PPC_BOOK3E_64 mfmsr r24 ori r24,r24,MSR_RI mtmsrd r24 /* RI on */ @@ -160,7 +160,7 @@ __secondary_hold: sync li r26,0 -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 tovirt(r26,r26) #endif /* All secondary cpus wait here until told to start. */ @@ -169,7 +169,7 @@ __secondary_hold: beq 100b #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 tovirt(r12,r12) #endif mtctr r12 @@ -178,7 +178,7 @@ __secondary_hold: * it may be the case that other platforms have r4 right to * begin with, this gives us some safety in case it is not */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 mr r4,r25 #else li r4,0 @@ -214,7 +214,7 @@ USE_TEXT_SECTION() #include "interrupt_64.S" -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* * The booting_thread_hwid holds the thread id we want to boot in cpu * hotplug case. It is set by cpu hotplug code, and is invalid by default. @@ -322,7 +322,7 @@ _GLOBAL(fsl_secondary_thread_init) bl book3e_secondary_thread_init b generic_secondary_common_init -#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_BOOK3E_64 */ /* * On pSeries and most other platforms, secondary processors spin @@ -345,7 +345,7 @@ _GLOBAL(generic_secondary_smp_init) bl relative_toc tovirt(r2,r2) -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* Book3E initialization */ mr r3,r24 mr r4,r25 @@ -417,7 +417,7 @@ generic_secondary_common_init: b kexec_wait /* next kernel might do better */ 2: SET_PACA(r13) -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 addi r12,r13,PACA_EXTLB /* and TLB exc frame in another */ mtspr SPRN_SPRG_TLB_EXFRAME,r12 #endif @@ -519,7 +519,7 @@ __start_initialization_multiplatform: mr r29,r9 #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 bl start_initialization_book3e b __after_prom_start #else @@ -540,7 +540,7 @@ __start_initialization_multiplatform: /* Switch off MMU if not already off */ bl __mmu_off b __after_prom_start -#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_BOOK3E_64 */ __REF __boot_from_prom: @@ -587,11 +587,11 @@ __after_prom_start: /* process relocations for the final address of the kernel */ lis r25,PAGE_OFFSET@highest /* compute virtual base of kernel */ sldi r25,r25,32 -#if defined(CONFIG_PPC_BOOK3E) +#if defined(CONFIG_PPC_BOOK3E_64) tovirt(r26,r26) /* on booke, we already run at PAGE_OFFSET */ #endif lwz r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26) -#if defined(CONFIG_PPC_BOOK3E) +#if defined(CONFIG_PPC_BOOK3E_64) tophys(r26,r26) #endif cmplwi cr0,r7,1 /* flagged to stay where we are ? */ @@ -599,7 +599,7 @@ __after_prom_start: add r25,r25,r26 1: mr r3,r25 bl relocate -#if defined(CONFIG_PPC_BOOK3E) +#if defined(CONFIG_PPC_BOOK3E_64) /* IVPR needs to be set after relocation. */ bl init_core_book3e #endif @@ -613,11 +613,11 @@ __after_prom_start: * Note: This process overwrites the OF exception vectors. */ li r3,0 /* target addr */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 tovirt(r3,r3) /* on booke, we already run at PAGE_OFFSET */ #endif mr. r4,r26 /* In some cases the loader may */ -#if defined(CONFIG_PPC_BOOK3E) +#if defined(CONFIG_PPC_BOOK3E_64) tovirt(r4,r4) #endif beq 9f /* have already put us at zero */ @@ -630,14 +630,14 @@ __after_prom_start: * variable __run_at_load, if it is set the kernel is treated as relocatable * kernel, otherwise it will be moved to PHYSICAL_START */ -#if defined(CONFIG_PPC_BOOK3E) +#if defined(CONFIG_PPC_BOOK3E_64) tovirt(r26,r26) /* on booke, we already run at PAGE_OFFSET */ #endif lwz r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26) cmplwi cr0,r7,1 bne 3f -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 LOAD_REG_ADDR(r5, __end_interrupts) LOAD_REG_ADDR(r11, _stext) sub r5,r5,r11 @@ -871,10 +871,10 @@ _GLOBAL(start_secondary_resume) */ enable_64b_mode: mfmsr r11 /* grab the current MSR */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */ mtmsr r11 -#else /* CONFIG_PPC_BOOK3E */ +#else /* CONFIG_PPC_BOOK3E_64 */ LOAD_REG_IMMEDIATE(r12, MSR_64BIT) or r11,r11,r12 mtmsrd r11 @@ -940,7 +940,7 @@ start_here_multiplatform: std r29,8(r11); #endif -#ifndef CONFIG_PPC_BOOK3E +#ifndef CONFIG_PPC_BOOK3E_64 mfmsr r6 ori r6,r6,MSR_RI mtmsrd r6 /* RI on */ diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index fd6d8d3a548e..36184cada00b 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -286,7 +286,7 @@ kexec_flag: #ifdef CONFIG_KEXEC_CORE -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* * BOOK3E has no real MMU mode, so we have to setup the initial TLB * for a core to identity map v:0 to p:0. This current implementation @@ -354,7 +354,7 @@ _GLOBAL(kexec_smp_wait) * don't overwrite r3 here, it is live for kexec_wait above. */ real_mode: /* assume normal blr return */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* Create an identity mapping. */ b kexec_create_tlb #else @@ -413,7 +413,7 @@ _GLOBAL(kexec_sequence) lhz r25,PACAHWCPUID(r13) /* get our phys cpu from paca */ /* disable interrupts, we are overwriting kernel data next */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 wrteei 0 #else mfmsr r3 diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index dfd097b79160..be8db402e963 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -186,7 +186,7 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) #ifdef CONFIG_PPC_PSERIES new_paca->lppaca_ptr = NULL; #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 new_paca->kernel_pgd = swapper_pg_dir; #endif new_paca->lock_token = 0x8000; @@ -203,7 +203,7 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->slb_shadow_ptr = NULL; #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* For now -- if we have threads this will be adjusted later */ new_paca->tcd_ptr = &new_paca->tcd; #endif @@ -215,7 +215,7 @@ void setup_paca(struct paca_struct *new_paca) /* Setup r13 */ local_paca = new_paca; -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* On Book3E, initialize the TLB miss exception frames */ mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb); #else diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index 93f22da12abe..7912bb50a7cb 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -23,7 +23,7 @@ void check_smt_enabled(void); static inline void check_smt_enabled(void) { } #endif -#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP) +#if defined(CONFIG_PPC_BOOK3E_64) && defined(CONFIG_SMP) void setup_tlb_core_data(void); #else static inline void setup_tlb_core_data(void) { } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 2b2d0b0fbb30..6434a3f6acb5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -86,7 +86,7 @@ struct ppc64_caches ppc64_caches = { }; EXPORT_SYMBOL_GPL(ppc64_caches); -#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP) +#if defined(CONFIG_PPC_BOOK3E_64) && defined(CONFIG_SMP) void __init setup_tlb_core_data(void) { int cpu; @@ -673,7 +673,7 @@ void __init initialize_cache_info(void) */ __init u64 ppc64_bolted_size(void) { -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* Freescale BookE bolts the entire linear mapping */ /* XXX: BookE ppc64_rma_limit setup seems to disagree? */ if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) @@ -723,7 +723,7 @@ void __init irqstack_early_init(void) } } -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 void __init exc_lvl_early_init(void) { unsigned int i; @@ -825,7 +825,7 @@ void __init setup_per_cpu_areas(void) /* * BookE and BookS radix are historical values and should be revisited. */ - if (IS_ENABLED(CONFIG_PPC_BOOK3E)) { + if (IS_ENABLED(CONFIG_PPC_BOOK3E_64)) { atom_size = SZ_1M; } else if (radix_enabled()) { atom_size = PAGE_SIZE; diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index e68eb9381066..b60d81acccfc 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -71,7 +71,7 @@ SECTIONS .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { #ifdef CONFIG_PPC64 KEEP(*(.head.text.first_256B)); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 #else KEEP(*(.head.text.real_vectors)); *(.head.text.real_trampolines); diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index c2bea9db1c1e..a79e28c91e2b 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image) * the RMA. On BookE there is no real MMU off mode, so we have to * keep it enabled as well (but then we have bolted TLB entries). */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 copy_with_mmu_off = false; #else copy_with_mmu_off = radix_enabled() || diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 88805757d0c9..341c2e0c71d2 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid) #else /* CONFIG_40x || CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) @@ -55,7 +55,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); trace_tlbie(0, 0, address, pid, 0, 0, 0); } -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) extern void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, } #endif /* CONFIG_PPC_8xx */ -#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) +#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index d378031246ab..6914bc8e4ead 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -294,7 +294,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) /* * New Book3E (>= 2.06) implementation * diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index be16eba0f704..069628670a0c 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 menuconfig FSL_SOC_BOOKE bool "Freescale Book-E Machine Type" - depends on PPC_85xx || PPC_BOOK3E + depends on PPC_85xx || PPC_BOOK3E_64 select FSL_SOC select PPC_UDBG_16550 select MPIC diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 11780074eb23..d63b6386a974 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -257,10 +257,6 @@ config PPC_BOOK3S def_bool y depends on PPC_BOOK3S_32 || PPC_BOOK3S_64 -config PPC_BOOK3E - def_bool y - depends on PPC_BOOK3E_64 - config E500 select FSL_EMB_PERFMON select PPC_FSL_BOOK3E @@ -316,7 +312,7 @@ config 4xx config BOOKE bool - depends on E500 || 44x || PPC_BOOK3E + depends on E500 || 44x || PPC_BOOK3E_64 default y config BOOKE_OR_40x @@ -480,7 +476,7 @@ config PPC_MMU_NOHASH config PPC_BOOK3E_MMU def_bool y - depends on PPC_85xx || PPC_BOOK3E + depends on PPC_85xx || PPC_BOOK3E_64 config PPC_HAVE_PMU_SUPPORT bool @@ -503,7 +499,7 @@ config FORCE_SMP select SMP config SMP - depends on PPC_BOOK3S || PPC_BOOK3E || PPC_85xx || PPC_47x + depends on PPC_BOOK3S || PPC_BOOK3E_64 || PPC_85xx || PPC_47x select GENERIC_IRQ_MIGRATION bool "Symmetric multi-processing support" if !FORCE_SMP help diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 26ef3388c24c..e6d678d27b0f 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -195,7 +195,7 @@ static int do_spu_cmd(void); #ifdef CONFIG_44x static void dump_tlb_44x(void); #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 static void dump_tlb_book3e(void); #endif @@ -288,11 +288,11 @@ Commands:\n\ t print backtrace\n\ x exit monitor and recover\n\ X exit monitor and don't recover\n" -#if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E) +#if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E_64) " u dump segment table or SLB\n" #elif defined(CONFIG_PPC_BOOK3S_32) " u dump segment registers\n" -#elif defined(CONFIG_44x) || defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_44x) || defined(CONFIG_PPC_BOOK3E_64) " u dump TLB\n" #endif " U show uptime information\n" @@ -1166,7 +1166,7 @@ cmds(struct pt_regs *excp) case 'u': dump_tlb_44x(); break; -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) case 'u': dump_tlb_book3e(); break; @@ -2686,7 +2686,7 @@ static void dump_one_paca(int cpu) DUMP(p, rfi_flush_fallback_area, "%-*px"); #endif DUMP(p, dscr_default, "%#-*llx"); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 DUMP(p, pgd, "%-*px"); DUMP(p, kernel_pgd, "%-*px"); DUMP(p, tcd_ptr, "%-*px"); @@ -2701,7 +2701,7 @@ static void dump_one_paca(int cpu) DUMP(p, canary, "%#-*lx"); #endif DUMP(p, saved_r1, "%#-*llx"); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 DUMP(p, trap_save, "%#-*x"); #endif DUMP(p, irq_soft_mask, "%#-*x"); @@ -3823,7 +3823,7 @@ static void dump_tlb_44x(void) } #endif /* CONFIG_44x */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 static void dump_tlb_book3e(void) { u32 mmucfg, pidmask, lpidmask; @@ -3965,7 +3965,7 @@ static void dump_tlb_book3e(void) } } } -#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_BOOK3E_64 */ static void xmon_init(int enable) { -- cgit v1.2.3 From 688de017efaab8a7764ab2c05ce7128d0361023b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Sep 2022 19:01:35 +0200 Subject: powerpc: Change CONFIG_E500 to CONFIG_PPC_E500 It will be used outside arch/powerpc, make it clear its a powerpc configuration item. And we already have CONFIG_PPC_E500MC, so that will make it more consistent. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e63b22083c11c4300f4a82d3123a46e5fdd54fa6.1663606876.git.christophe.leroy@csgroup.eu --- arch/powerpc/Makefile | 2 +- arch/powerpc/include/asm/cputable.h | 4 ++-- arch/powerpc/include/asm/kgdb.h | 2 +- arch/powerpc/include/asm/mmu.h | 4 ++-- arch/powerpc/include/asm/reg_booke.h | 6 +++--- arch/powerpc/include/asm/synch.h | 2 +- arch/powerpc/include/asm/vdso/timebase.h | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/cpu_setup_fsl_booke.S | 4 ++-- arch/powerpc/kernel/entry_32.S | 4 ++-- arch/powerpc/kernel/head_85xx.S | 4 ++-- arch/powerpc/kernel/head_booke.h | 2 +- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/traps.c | 2 +- arch/powerpc/kvm/Kconfig | 4 ++-- arch/powerpc/platforms/Kconfig.cputype | 26 +++++++++++++------------- arch/powerpc/sysdev/fsl_pci.c | 2 +- arch/powerpc/sysdev/fsl_rio.c | 2 +- 18 files changed, 38 insertions(+), 38 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index f6d477c4aa64..cb01832385d0 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -210,7 +210,7 @@ KBUILD_CFLAGS += $(call cc-option,-mno-string) cpu-as-$(CONFIG_40x) += -Wa,-m405 cpu-as-$(CONFIG_44x) += -Wa,-m440 cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec) -cpu-as-$(CONFIG_E500) += -Wa,-me500 +cpu-as-$(CONFIG_PPC_E500) += -Wa,-me500 # When using '-many -mpower4' gas will first try and find a matching power4 # mnemonic and failing that it will allow any valid mnemonic that GAS knows diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 27875f0b7bc7..757dbded11dc 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -510,7 +510,7 @@ enum { #elif defined(CONFIG_44x) CPU_FTRS_44X | CPU_FTRS_440x6 | #endif -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 CPU_FTRS_E500 | CPU_FTRS_E500_2 | #endif #ifdef CONFIG_PPC_E500MC @@ -584,7 +584,7 @@ enum { #elif defined(CONFIG_44x) CPU_FTRS_44X & CPU_FTRS_440x6 & #endif -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 CPU_FTRS_E500 & CPU_FTRS_E500_2 & #endif #ifdef CONFIG_PPC_E500MC diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h index a9e098a3b881..715c18b75334 100644 --- a/arch/powerpc/include/asm/kgdb.h +++ b/arch/powerpc/include/asm/kgdb.h @@ -52,7 +52,7 @@ static inline void arch_kgdb_breakpoint(void) /* On non-E500 family PPC32 we determine the size by picking the last * register we need, but on E500 we skip sections so we list what we * need to store, and add it up. */ -#ifndef CONFIG_E500 +#ifndef CONFIG_PPC_E500 #define MAXREG (PT_FPSCR+1) #else /* 32 GPRs (8 bytes), nip, msr, ccr, link, ctr, xer, acc (8 bytes), spefscr*/ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 860d0290ca4d..5b46da9ba7f6 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -162,7 +162,7 @@ enum { #elif defined(CONFIG_44x) MMU_FTR_TYPE_44x | #endif -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | MMU_FTR_USE_TLBILX | #endif #ifdef CONFIG_PPC_BOOK3S_32 @@ -211,7 +211,7 @@ enum { #elif defined(CONFIG_44x) #define MMU_FTRS_ALWAYS MMU_FTR_TYPE_44x #endif -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 #define MMU_FTRS_ALWAYS MMU_FTR_TYPE_FSL_E #endif diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 17b8dcd9a40d..af56980b6cdb 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -246,7 +246,7 @@ #define PPC47x_MCSR_FPR 0x00800000 /* FPR parity error */ #define PPC47x_MCSR_IPR 0x00400000 /* Imprecise Machine Check Exception */ -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 /* All e500 */ #define MCSR_MCP 0x80000000UL /* Machine Check Input Pin */ #define MCSR_ICPERR 0x40000000UL /* I-Cache Parity Error */ @@ -282,7 +282,7 @@ #endif /* Bit definitions for the HID1 */ -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 /* e500v1/v2 */ #define HID1_PLL_CFG_MASK 0xfc000000 /* PLL_CFG input pins */ #define HID1_RFXE 0x00020000 /* Read fault exception enable */ @@ -545,7 +545,7 @@ #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ #define TCR_ARE 0x00400000 /* Auto Reload Enable */ -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 #define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ (((tcr) & 0x1E0000) >> 15)) #else diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index 7130176d8cb8..b0b4c64870d7 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -44,7 +44,7 @@ static inline void ppc_after_tlbiel_barrier(void) #if defined(__powerpc64__) # define LWSYNC lwsync -#elif defined(CONFIG_E500) +#elif defined(CONFIG_PPC_E500) # define LWSYNC \ START_LWSYNC_SECTION(96); \ sync; \ diff --git a/arch/powerpc/include/asm/vdso/timebase.h b/arch/powerpc/include/asm/vdso/timebase.h index 891c9d5eaabe..e9245f86a46c 100644 --- a/arch/powerpc/include/asm/vdso/timebase.h +++ b/arch/powerpc/include/asm/vdso/timebase.h @@ -12,7 +12,7 @@ * We use __powerpc64__ here because we want the compat VDSO to use the 32-bit * version below in the else case of the ifdef. */ -#if defined(__powerpc64__) && (defined(CONFIG_PPC_CELL) || defined(CONFIG_E500)) +#if defined(__powerpc64__) && (defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_E500)) #define mftb() ({unsigned long rval; \ asm volatile( \ "90: mfspr %0, %2;\n" \ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 4483cae7dc9f..33dafd12e81d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -101,7 +101,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_FA_DUMP) += fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += fadump.o ifdef CONFIG_PPC32 -obj-$(CONFIG_E500) += idle_e500.o +obj-$(CONFIG_PPC_E500) += idle_e500.o endif obj-$(CONFIG_PPC_BOOK3S_32) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S index 4bf33f1b4193..058336079069 100644 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S @@ -108,7 +108,7 @@ _GLOBAL(__setup_cpu_e6500) #endif /* CONFIG_PPC_E500MC */ #ifdef CONFIG_PPC32 -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 #ifndef CONFIG_PPC_E500MC _GLOBAL(__setup_cpu_e500v1) _GLOBAL(__setup_cpu_e500v2) @@ -156,7 +156,7 @@ _GLOBAL(__setup_cpu_e5500) mtlr r5 blr #endif /* CONFIG_PPC_E500MC */ -#endif /* CONFIG_E500 */ +#endif /* CONFIG_PPC_E500 */ #endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC_BOOK3E_64 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1d599df6f169..e6d5fe3a8585 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -49,7 +49,7 @@ */ .align 12 -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_E500) .globl prepare_transfer_to_handler prepare_transfer_to_handler: /* if from kernel, check interrupted DOZE/NAP mode */ @@ -71,7 +71,7 @@ prepare_transfer_to_handler: lwz r2, GPR2(r11) b fast_exception_return _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler) -#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */ +#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_PPC_E500 */ #if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32) .globl __kuep_lock diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 48b168b5dc57..52c0ab416326 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -912,7 +912,7 @@ get_phys_addr: * Global functions */ -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 #ifndef CONFIG_PPC_E500MC /* Adjust or setup IVORs for e500v1/v2 */ _GLOBAL(__setup_e500_ivors) @@ -955,7 +955,7 @@ _GLOBAL(__setup_ehv_ivors) sync blr #endif /* CONFIG_PPC_E500MC */ -#endif /* CONFIG_E500 */ +#endif /* CONFIG_PPC_E500 */ #ifdef CONFIG_SPE /* diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index bb6d5d0fc4ac..a2f82ced6e4a 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -103,7 +103,7 @@ END_BTB_FLUSH_SECTION .endm .macro prepare_transfer_to_handler -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 andi. r12,r9,MSR_PR bne 777f bl prepare_transfer_to_handler diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 813261789303..b761cc1a403c 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -207,7 +207,7 @@ void __init setup_power_save(void) ppc_md.power_save = ppc6xx_idle; #endif -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 if (cpu_has_feature(CPU_FTR_CAN_DOZE) || cpu_has_feature(CPU_FTR_CAN_NAP)) ppc_md.power_save = e500_idle; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f181c434289e..62ec50a7a8ef 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -600,7 +600,7 @@ static inline int check_io_access(struct pt_regs *regs) #define inst_length(reason) (((reason) & REASON_PREFIXED) ? 8 : 4) -#if defined(CONFIG_E500) +#if defined(CONFIG_PPC_E500) int machine_check_e500mc(struct pt_regs *regs) { unsigned long mcsr = mfspr(SPRN_MCSR); diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index dcb398d5e009..61cdd782d3c5 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -189,7 +189,7 @@ config KVM_EXIT_TIMING config KVM_E500V2 bool "KVM support for PowerPC E500v2 processors" - depends on E500 && !PPC_E500MC + depends on PPC_E500 && !PPC_E500MC select KVM select KVM_MMIO select MMU_NOTIFIER @@ -220,7 +220,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" - depends on KVM && E500 + depends on KVM && PPC_E500 select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d63b6386a974..5b065186ace5 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -33,7 +33,7 @@ config PPC_BOOK3S_32 config PPC_85xx bool "Freescale 85xx" - select E500 + select PPC_E500 config PPC_8xx bool "Freescale 8xx" @@ -108,7 +108,7 @@ config PPC_BOOK3S_64 config PPC_BOOK3E_64 bool "Embedded processors" select PPC_FSL_BOOK3E - select E500 + select PPC_E500 select PPC_E500MC select PPC_FPU # Make it a choice ? select PPC_SMP_MUXED_IPI @@ -175,11 +175,11 @@ config POWER9_CPU config E5500_CPU bool "Freescale e5500" - depends on PPC64 && E500 + depends on PPC64 && PPC_E500 config E6500_CPU bool "Freescale e6500" - depends on PPC64 && E500 + depends on PPC64 && PPC_E500 config 405_CPU bool "40x family" @@ -257,7 +257,7 @@ config PPC_BOOK3S def_bool y depends on PPC_BOOK3S_32 || PPC_BOOK3S_64 -config E500 +config PPC_E500 select FSL_EMB_PERFMON select PPC_FSL_BOOK3E bool @@ -266,7 +266,7 @@ config PPC_E500MC bool "e500mc Support" select PPC_FPU select COMMON_CLK - depends on E500 + depends on PPC_E500 help This must be enabled for running on e500mc (and derivatives such as e5500/e6500), and must be disabled for running on @@ -289,7 +289,7 @@ config PPC_FPU config FSL_EMB_PERFMON bool "Freescale Embedded Perfmon" - depends on E500 || PPC_83xx + depends on PPC_E500 || PPC_83xx help This is the Performance Monitor support found on the e500 core and some e300 cores (c3 and c4). Select this only if your @@ -302,7 +302,7 @@ config FSL_EMB_PERF_EVENT config FSL_EMB_PERF_EVENT_E500 bool - depends on FSL_EMB_PERF_EVENT && E500 + depends on FSL_EMB_PERF_EVENT && PPC_E500 default y config 4xx @@ -312,7 +312,7 @@ config 4xx config BOOKE bool - depends on E500 || 44x || PPC_BOOK3E_64 + depends on PPC_E500 || 44x || PPC_BOOK3E_64 default y config BOOKE_OR_40x @@ -332,12 +332,12 @@ config PPC_FSL_BOOK3E config PTE_64BIT bool - depends on 44x || E500 || PPC_86xx + depends on 44x || PPC_E500 || PPC_86xx default y if PHYS_64BIT config PHYS_64BIT - bool 'Large physical address support' if E500 || PPC_86xx - depends on (44x || E500 || PPC_86xx) && !PPC_83xx && !PPC_82xx + bool 'Large physical address support' if PPC_E500 || PPC_86xx + depends on (44x || PPC_E500 || PPC_86xx) && !PPC_83xx && !PPC_82xx select PHYS_ADDR_T_64BIT help This option enables kernel support for larger than 32-bit physical @@ -384,7 +384,7 @@ config VSX config SPE_POSSIBLE def_bool y - depends on E500 && !PPC_E500MC + depends on PPC_E500 && !PPC_E500MC config SPE bool "SPE Support" diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 1ef7400ef244..974d3db6faab 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -943,7 +943,7 @@ u64 fsl_pci_immrbar_base(struct pci_controller *hose) return 0; } -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 static int mcheck_handle_load(struct pt_regs *regs, u32 inst) { unsigned int rd, ra, rb, d; diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 4647c6074f3b..c8f044d62fe2 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -98,7 +98,7 @@ resource_size_t rio_law_start; struct fsl_rio_dbell *dbell; struct fsl_rio_pw *pw; -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 int fsl_rio_mcheck_exception(struct pt_regs *regs) { const struct exception_table_entry *entry; -- cgit v1.2.3 From 3e7318584dfec11992f3ac45658c4bc1210b3778 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Sep 2022 19:01:38 +0200 Subject: powerpc: Remove CONFIG_PPC_FSL_BOOK3E CONFIG_PPC_FSL_BOOK3E is redundant with CONFIG_PPC_E500. Remove it. And rename five files accordingly. Signed-off-by: Christophe Leroy [mpe: Rename include guards to match new file names] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/795cb93b88c9a0279289712e674f39e3b108a1b4.1663606876.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/barrier.h | 2 +- arch/powerpc/include/asm/hugetlb.h | 4 +- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/mmu.h | 2 +- arch/powerpc/include/asm/nohash/32/pgtable.h | 2 +- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 +- arch/powerpc/include/asm/nohash/hugetlb-book3e.h | 45 --- arch/powerpc/include/asm/nohash/hugetlb-e500.h | 45 +++ arch/powerpc/include/asm/nohash/pgtable.h | 2 +- arch/powerpc/include/asm/nohash/pte-book3e.h | 129 -------- arch/powerpc/include/asm/nohash/pte-e500.h | 129 ++++++++ arch/powerpc/include/asm/page.h | 2 +- arch/powerpc/include/asm/ppc_asm.h | 6 +- arch/powerpc/include/asm/setup.h | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/asm-offsets.c | 4 +- arch/powerpc/kernel/cpu_setup_e500.S | 333 ++++++++++++++++++++ arch/powerpc/kernel/cpu_setup_fsl_booke.S | 333 -------------------- arch/powerpc/kernel/head_booke.h | 2 +- arch/powerpc/kernel/interrupt_64.S | 2 +- arch/powerpc/kernel/security.c | 10 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/kernel/sysfs.c | 6 +- arch/powerpc/kernel/vmlinux.lds.S | 2 +- arch/powerpc/lib/feature-fixups.c | 4 +- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/powerpc/mm/mmu_decl.h | 4 +- arch/powerpc/mm/nohash/Makefile | 6 +- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 190 ------------ arch/powerpc/mm/nohash/e500.c | 375 +++++++++++++++++++++++ arch/powerpc/mm/nohash/e500_hugetlbpage.c | 190 ++++++++++++ arch/powerpc/mm/nohash/fsl_book3e.c | 375 ----------------------- arch/powerpc/mm/nohash/tlb.c | 16 +- arch/powerpc/mm/nohash/tlb_low.S | 2 +- arch/powerpc/platforms/Kconfig.cputype | 16 +- 37 files changed, 1123 insertions(+), 1131 deletions(-) delete mode 100644 arch/powerpc/include/asm/nohash/hugetlb-book3e.h create mode 100644 arch/powerpc/include/asm/nohash/hugetlb-e500.h delete mode 100644 arch/powerpc/include/asm/nohash/pte-book3e.h create mode 100644 arch/powerpc/include/asm/nohash/pte-e500.h create mode 100644 arch/powerpc/kernel/cpu_setup_e500.S delete mode 100644 arch/powerpc/kernel/cpu_setup_fsl_booke.S delete mode 100644 arch/powerpc/mm/nohash/book3e_hugetlbpage.c create mode 100644 arch/powerpc/mm/nohash/e500.c create mode 100644 arch/powerpc/mm/nohash/e500_hugetlbpage.c delete mode 100644 arch/powerpc/mm/nohash/fsl_book3e.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9d721cac20b8..a7b58645cc3f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -290,7 +290,7 @@ config PPC_LONG_DOUBLE_128 config PPC_BARRIER_NOSPEC bool default y - depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E + depends on PPC_BOOK3S_64 || PPC_E500 config EARLY_PRINTK bool diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index ef2d8b15eaab..e80b2c0e9315 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -86,7 +86,7 @@ do { \ #ifdef CONFIG_PPC_BOOK3S_64 #define NOSPEC_BARRIER_SLOT nop -#elif defined(CONFIG_PPC_FSL_BOOK3E) +#elif defined(CONFIG_PPC_E500) #define NOSPEC_BARRIER_SLOT nop; nop #endif diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 32ce0fb7548f..ea71f7245a63 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -7,8 +7,8 @@ #ifdef CONFIG_PPC_BOOK3S_64 #include -#elif defined(CONFIG_PPC_FSL_BOOK3E) -#include +#elif defined(CONFIG_PPC_E500) +#include #elif defined(CONFIG_PPC_8xx) #include #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c2b003550dc9..caea15dcb91d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -443,7 +443,7 @@ struct kvmppc_passthru_irqmap { }; #endif -# ifdef CONFIG_PPC_FSL_BOOK3E +# ifdef CONFIG_PPC_E500 #define KVMPPC_BOOKE_IAC_NUM 2 #define KVMPPC_BOOKE_DAC_NUM 2 # else diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 5b46da9ba7f6..39057320e436 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -141,7 +141,7 @@ typedef pte_t *pgtable_t; -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 #include DECLARE_PER_CPU(int, next_tlbcam_idx); #endif diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 197e7552d9f6..0d40b33184eb 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -131,7 +131,7 @@ void unmap_kernel_page(unsigned long va); #elif defined(CONFIG_44x) #include #elif defined(CONFIG_PPC_85xx) && defined(CONFIG_PTE_64BIT) -#include +#include #elif defined(CONFIG_PPC_85xx) #include #elif defined(CONFIG_PPC_8xx) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 599921cc257e..879e9a6e5a87 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -70,7 +70,7 @@ /* * Include the PTE bits definitions */ -#include +#include #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h deleted file mode 100644 index ecd8694cb229..000000000000 --- a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H -#define _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H - -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - if (WARN_ON(!hugepd_ok(hpd))) - return NULL; - - return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE); -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return hpd_val(hpd) & HUGEPD_SHIFT_MASK; -} - -static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, - unsigned int pdshift) -{ - /* - * On FSL BookE, we have multiple higher-level table entries that - * point to the same hugepte. Just use the first one since they're all - * identical. So for that case, idx=0. - */ - return hugepd_page(hpd); -} - -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); - -static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) -{ - /* We use the old format for PPC_FSL_BOOK3E */ - *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); -} - -static inline int check_and_get_huge_psize(int shift) -{ - if (shift & 1) /* Not a power of 4 */ - return -EINVAL; - - return shift_to_mmu_psize(shift); -} - -#endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h b/arch/powerpc/include/asm/nohash/hugetlb-e500.h new file mode 100644 index 000000000000..8f04ad20e040 --- /dev/null +++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_NOHASH_HUGETLB_E500_H +#define _ASM_POWERPC_NOHASH_HUGETLB_E500_H + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + if (WARN_ON(!hugepd_ok(hpd))) + return NULL; + + return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return hpd_val(hpd) & HUGEPD_SHIFT_MASK; +} + +static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, + unsigned int pdshift) +{ + /* + * On FSL BookE, we have multiple higher-level table entries that + * point to the same hugepte. Just use the first one since they're all + * identical. So for that case, idx=0. + */ + return hugepd_page(hpd); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); + +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + /* We use the old format for PPC_E500 */ + *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); +} + +static inline int check_and_get_huge_psize(int shift) +{ + if (shift & 1) /* Not a power of 4 */ + return -EINVAL; + + return shift_to_mmu_psize(shift); +} + +#endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */ diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 4fd73c7412d0..d9067dfc531c 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -266,7 +266,7 @@ static inline int pud_huge(pud_t pud) * We use it to ensure coherency between the i-cache and d-cache * for the page which has just been mapped in. */ -#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_PPC_E500) && defined(CONFIG_HUGETLB_PAGE) void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #else static inline diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h deleted file mode 100644 index f798640422c2..000000000000 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_PTE_BOOK3E_H -#define _ASM_POWERPC_NOHASH_PTE_BOOK3E_H -#ifdef __KERNEL__ - -/* PTE bit definitions for processors compliant to the Book3E - * architecture 2.06 or later. The position of the PTE bits - * matches the HW definition of the optional Embedded Page Table - * category. - */ - -/* Architected bits */ -#define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ -#define _PAGE_SW1 0x000002 -#define _PAGE_BIT_SWAP_TYPE 2 -#define _PAGE_BAP_SR 0x000004 -#define _PAGE_BAP_UR 0x000008 -#define _PAGE_BAP_SW 0x000010 -#define _PAGE_BAP_UW 0x000020 -#define _PAGE_BAP_SX 0x000040 -#define _PAGE_BAP_UX 0x000080 -#define _PAGE_PSIZE_MSK 0x000f00 -#define _PAGE_PSIZE_4K 0x000200 -#define _PAGE_PSIZE_8K 0x000300 -#define _PAGE_PSIZE_16K 0x000400 -#define _PAGE_PSIZE_32K 0x000500 -#define _PAGE_PSIZE_64K 0x000600 -#define _PAGE_PSIZE_128K 0x000700 -#define _PAGE_PSIZE_256K 0x000800 -#define _PAGE_PSIZE_512K 0x000900 -#define _PAGE_PSIZE_1M 0x000a00 -#define _PAGE_PSIZE_2M 0x000b00 -#define _PAGE_PSIZE_4M 0x000c00 -#define _PAGE_PSIZE_8M 0x000d00 -#define _PAGE_PSIZE_16M 0x000e00 -#define _PAGE_PSIZE_32M 0x000f00 -#define _PAGE_DIRTY 0x001000 /* C: page changed */ -#define _PAGE_SW0 0x002000 -#define _PAGE_U3 0x004000 -#define _PAGE_U2 0x008000 -#define _PAGE_U1 0x010000 -#define _PAGE_U0 0x020000 -#define _PAGE_ACCESSED 0x040000 -#define _PAGE_ENDIAN 0x080000 -#define _PAGE_GUARDED 0x100000 -#define _PAGE_COHERENT 0x200000 /* M: enforce memory coherence */ -#define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */ -#define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */ - -/* "Higher level" linux bit combinations */ -#define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was cache cleaned */ -#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ -#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) -#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) -#define _PAGE_KERNEL_RWX (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY | _PAGE_BAP_SX) -#define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) -#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ -#define _PAGE_PRIVILEGED (_PAGE_BAP_SR) - -#define _PAGE_SPECIAL _PAGE_SW0 - -/* Base page size */ -#define _PAGE_PSIZE _PAGE_PSIZE_4K -#define PTE_RPN_SHIFT (24) - -#define PTE_WIMGE_SHIFT (19) -#define PTE_BAP_SHIFT (2) - -/* On 32-bit, we never clear the top part of the PTE */ -#ifdef CONFIG_PPC32 -#define _PTE_NONE_MASK 0xffffffff00000000ULL -#define _PMD_PRESENT 0 -#define _PMD_PRESENT_MASK (PAGE_MASK) -#define _PMD_BAD (~PAGE_MASK) -#define _PMD_USER 0 -#else -#define _PTE_NONE_MASK 0 -#endif - -/* - * We define 2 sets of base prot bits, one for basic pages (ie, - * cacheable kernel and user pages) and one for non cacheable - * pages. We always set _PAGE_COHERENT when SMP is enabled or - * the processor might need it for DMA coherency. - */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#if defined(CONFIG_SMP) -#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) -#else -#define _PAGE_BASE (_PAGE_BASE_NC) -#endif - -/* Permission masks used to generate the __P and __S table */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_BAP_UX) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX) - -#ifndef __ASSEMBLY__ -static inline pte_t pte_mkprivileged(pte_t pte) -{ - return __pte((pte_val(pte) & ~_PAGE_USER) | _PAGE_PRIVILEGED); -} - -#define pte_mkprivileged pte_mkprivileged - -static inline pte_t pte_mkuser(pte_t pte) -{ - return __pte((pte_val(pte) & ~_PAGE_PRIVILEGED) | _PAGE_USER); -} - -#define pte_mkuser pte_mkuser - -static inline pte_t pte_mkexec(pte_t pte) -{ - if (pte_val(pte) & _PAGE_BAP_UR) - return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX); - else - return __pte((pte_val(pte) & ~_PAGE_BAP_UX) | _PAGE_BAP_SX); -} -#define pte_mkexec pte_mkexec - -#endif /* __ASSEMBLY__ */ - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_NOHASH_PTE_BOOK3E_H */ diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h new file mode 100644 index 000000000000..0934e8965e4e --- /dev/null +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_NOHASH_PTE_E500_H +#define _ASM_POWERPC_NOHASH_PTE_E500_H +#ifdef __KERNEL__ + +/* PTE bit definitions for processors compliant to the Book3E + * architecture 2.06 or later. The position of the PTE bits + * matches the HW definition of the optional Embedded Page Table + * category. + */ + +/* Architected bits */ +#define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ +#define _PAGE_SW1 0x000002 +#define _PAGE_BIT_SWAP_TYPE 2 +#define _PAGE_BAP_SR 0x000004 +#define _PAGE_BAP_UR 0x000008 +#define _PAGE_BAP_SW 0x000010 +#define _PAGE_BAP_UW 0x000020 +#define _PAGE_BAP_SX 0x000040 +#define _PAGE_BAP_UX 0x000080 +#define _PAGE_PSIZE_MSK 0x000f00 +#define _PAGE_PSIZE_4K 0x000200 +#define _PAGE_PSIZE_8K 0x000300 +#define _PAGE_PSIZE_16K 0x000400 +#define _PAGE_PSIZE_32K 0x000500 +#define _PAGE_PSIZE_64K 0x000600 +#define _PAGE_PSIZE_128K 0x000700 +#define _PAGE_PSIZE_256K 0x000800 +#define _PAGE_PSIZE_512K 0x000900 +#define _PAGE_PSIZE_1M 0x000a00 +#define _PAGE_PSIZE_2M 0x000b00 +#define _PAGE_PSIZE_4M 0x000c00 +#define _PAGE_PSIZE_8M 0x000d00 +#define _PAGE_PSIZE_16M 0x000e00 +#define _PAGE_PSIZE_32M 0x000f00 +#define _PAGE_DIRTY 0x001000 /* C: page changed */ +#define _PAGE_SW0 0x002000 +#define _PAGE_U3 0x004000 +#define _PAGE_U2 0x008000 +#define _PAGE_U1 0x010000 +#define _PAGE_U0 0x020000 +#define _PAGE_ACCESSED 0x040000 +#define _PAGE_ENDIAN 0x080000 +#define _PAGE_GUARDED 0x100000 +#define _PAGE_COHERENT 0x200000 /* M: enforce memory coherence */ +#define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */ +#define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */ + +/* "Higher level" linux bit combinations */ +#define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was cache cleaned */ +#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ +#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) +#define _PAGE_KERNEL_RWX (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY | _PAGE_BAP_SX) +#define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) +#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ +#define _PAGE_PRIVILEGED (_PAGE_BAP_SR) + +#define _PAGE_SPECIAL _PAGE_SW0 + +/* Base page size */ +#define _PAGE_PSIZE _PAGE_PSIZE_4K +#define PTE_RPN_SHIFT (24) + +#define PTE_WIMGE_SHIFT (19) +#define PTE_BAP_SHIFT (2) + +/* On 32-bit, we never clear the top part of the PTE */ +#ifdef CONFIG_PPC32 +#define _PTE_NONE_MASK 0xffffffff00000000ULL +#define _PMD_PRESENT 0 +#define _PMD_PRESENT_MASK (PAGE_MASK) +#define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 +#else +#define _PTE_NONE_MASK 0 +#endif + +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#if defined(CONFIG_SMP) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_BAP_UX) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX) + +#ifndef __ASSEMBLY__ +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_USER) | _PAGE_PRIVILEGED); +} + +#define pte_mkprivileged pte_mkprivileged + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_PRIVILEGED) | _PAGE_USER); +} + +#define pte_mkuser pte_mkuser + +static inline pte_t pte_mkexec(pte_t pte) +{ + if (pte_val(pte) & _PAGE_BAP_UR) + return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX); + else + return __pte((pte_val(pte) & ~_PAGE_BAP_UX) | _PAGE_BAP_SX); +} +#define pte_mkexec pte_mkexec + +#endif /* __ASSEMBLY__ */ + +#endif /* __KERNEL__ */ +#endif /* _ASM_POWERPC_NOHASH_PTE_E500_H */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 7f20636d13ed..edf1dd1b0ca9 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -31,7 +31,7 @@ extern unsigned int hpage_shift; #define HPAGE_SHIFT hpage_shift #elif defined(CONFIG_PPC_8xx) #define HPAGE_SHIFT 19 /* 512k pages */ -#elif defined(CONFIG_PPC_FSL_BOOK3E) +#elif defined(CONFIG_PPC_E500) #define HPAGE_SHIFT 22 /* 4M pages */ #endif #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 55149a0384db..7e4fe766e247 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -342,7 +342,7 @@ n: #endif /* various errata or part fixups */ -#if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_E500) #define MFTB(dest) \ 90: mfspr dest, SPRN_TBRL; \ BEGIN_FTR_SECTION_NESTED(96); \ @@ -768,7 +768,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) stringify_in_c(.llong (_target);) \ stringify_in_c(.previous) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 #define BTB_FLUSH(reg) \ lis reg,BUCSR_INIT@h; \ ori reg,reg,BUCSR_INIT@l; \ @@ -776,6 +776,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) isync; #else #define BTB_FLUSH(reg) -#endif /* CONFIG_PPC_FSL_BOOK3E */ +#endif /* CONFIG_PPC_E500 */ #endif /* _ASM_POWERPC_PPC_ASM_H */ diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index dd461b2c825c..85143849a586 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -69,7 +69,7 @@ void do_barrier_nospec_fixups_range(bool enable, void *start, void *end); static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void *end) { } #endif -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 void __init setup_spectre_v2(void); #else static inline void setup_spectre_v2(void) {} diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 33dafd12e81d..658c4dffaa56 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -114,7 +114,7 @@ endif obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o obj-$(CONFIG_MODULES) += module.o module_$(BITS).o obj-$(CONFIG_44x) += cpu_setup_44x.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o +obj-$(CONFIG_PPC_E500) += cpu_setup_e500.o obj-$(CONFIG_PPC_DOORBELL) += dbell.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 10ce03052a19..4ce2a4aa3985 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -59,7 +59,7 @@ #endif #endif -#if defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_E500) #include "../mm/mmu_decl.h" #endif @@ -651,7 +651,7 @@ int main(void) DEFINE(PGD_T_LOG2, PGD_T_LOG2); DEFINE(PTE_T_LOG2, PTE_T_LOG2); #endif -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 DEFINE(TLBCAM_SIZE, sizeof(struct tlbcam)); OFFSET(TLBCAM_MAS0, tlbcam, MAS0); OFFSET(TLBCAM_MAS1, tlbcam, MAS1); diff --git a/arch/powerpc/kernel/cpu_setup_e500.S b/arch/powerpc/kernel/cpu_setup_e500.S new file mode 100644 index 000000000000..058336079069 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_e500.S @@ -0,0 +1,333 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This file contains low level CPU setup functions. + * Kumar Gala + * Copyright 2009 Freescale Semiconductor, Inc. + * + * Based on cpu_setup_6xx code by + * Benjamin Herrenschmidt + */ + +#include +#include +#include +#include +#include +#include +#include + +_GLOBAL(__e500_icache_setup) + mfspr r0, SPRN_L1CSR1 + andi. r3, r0, L1CSR1_ICE + bnelr /* Already enabled */ + oris r0, r0, L1CSR1_CPE@h + ori r0, r0, (L1CSR1_ICFI | L1CSR1_ICLFR | L1CSR1_ICE) + mtspr SPRN_L1CSR1, r0 /* Enable I-Cache */ + isync + blr + +_GLOBAL(__e500_dcache_setup) + mfspr r0, SPRN_L1CSR0 + andi. r3, r0, L1CSR0_DCE + bnelr /* Already enabled */ + msync + isync + li r0, 0 + mtspr SPRN_L1CSR0, r0 /* Disable */ + msync + isync + li r0, (L1CSR0_DCFI | L1CSR0_CLFC) + mtspr SPRN_L1CSR0, r0 /* Invalidate */ + isync +1: mfspr r0, SPRN_L1CSR0 + andi. r3, r0, L1CSR0_CLFC + bne+ 1b /* Wait for lock bits reset */ + oris r0, r0, L1CSR0_CPE@h + ori r0, r0, L1CSR0_DCE + msync + isync + mtspr SPRN_L1CSR0, r0 /* Enable */ + isync + blr + +/* + * FIXME - we haven't yet done testing to determine a reasonable default + * value for PW20_WAIT_IDLE_BIT. + */ +#define PW20_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ +_GLOBAL(setup_pw20_idle) + mfspr r3, SPRN_PWRMGTCR0 + + /* Set PW20_WAIT bit, enable pw20 state*/ + ori r3, r3, PWRMGTCR0_PW20_WAIT + li r11, PW20_WAIT_IDLE_BIT + + /* Set Automatic PW20 Core Idle Count */ + rlwimi r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT + + mtspr SPRN_PWRMGTCR0, r3 + + blr + +/* + * FIXME - we haven't yet done testing to determine a reasonable default + * value for AV_WAIT_IDLE_BIT. + */ +#define AV_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ +_GLOBAL(setup_altivec_idle) + mfspr r3, SPRN_PWRMGTCR0 + + /* Enable Altivec Idle */ + oris r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h + li r11, AV_WAIT_IDLE_BIT + + /* Set Automatic AltiVec Idle Count */ + rlwimi r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT + + mtspr SPRN_PWRMGTCR0, r3 + + blr + +#ifdef CONFIG_PPC_E500MC +_GLOBAL(__setup_cpu_e6500) + mflr r6 +#ifdef CONFIG_PPC64 + bl setup_altivec_ivors + /* Touch IVOR42 only if the CPU supports E.HV category */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_lrat_ivor +1: +#endif + bl setup_pw20_idle + bl setup_altivec_idle + bl __setup_cpu_e5500 + mtlr r6 + blr +#endif /* CONFIG_PPC_E500MC */ + +#ifdef CONFIG_PPC32 +#ifdef CONFIG_PPC_E500 +#ifndef CONFIG_PPC_E500MC +_GLOBAL(__setup_cpu_e500v1) +_GLOBAL(__setup_cpu_e500v2) + mflr r4 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_e500_ivors +#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI) + /* Ensure that RFXE is set */ + mfspr r3,SPRN_HID1 + oris r3,r3,HID1_RFXE@h + mtspr SPRN_HID1,r3 +#endif + mtlr r4 + blr +#else /* CONFIG_PPC_E500MC */ +_GLOBAL(__setup_cpu_e500mc) +_GLOBAL(__setup_cpu_e5500) + mflr r5 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_e500mc_ivors + /* + * We only want to touch IVOR38-41 if we're running on hardware + * that supports category E.HV. The architectural way to determine + * this is MMUCFG[LPIDSIZE]. + */ + mfspr r3, SPRN_MMUCFG + rlwinm. r3, r3, 0, MMUCFG_LPIDSIZE + beq 1f + bl __setup_ehv_ivors + b 2f +1: + lwz r3, CPU_SPEC_FEATURES(r4) + /* We need this check as cpu_setup is also called for + * the secondary cores. So, if we have already cleared + * the feature on the primary core, avoid doing it on the + * secondary core. + */ + andi. r6, r3, CPU_FTR_EMB_HV + beq 2f + rlwinm r3, r3, 0, ~CPU_FTR_EMB_HV + stw r3, CPU_SPEC_FEATURES(r4) +2: + mtlr r5 + blr +#endif /* CONFIG_PPC_E500MC */ +#endif /* CONFIG_PPC_E500 */ +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC_BOOK3E_64 +_GLOBAL(__restore_cpu_e6500) + mflr r5 + bl setup_altivec_ivors + /* Touch IVOR42 only if the CPU supports E.HV category */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_lrat_ivor +1: + bl setup_pw20_idle + bl setup_altivec_idle + bl __restore_cpu_e5500 + mtlr r5 + blr + +_GLOBAL(__restore_cpu_e5500) + mflr r4 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_base_ivors + bl setup_perfmon_ivor + bl setup_doorbell_ivors + /* + * We only want to touch IVOR38-41 if we're running on hardware + * that supports category E.HV. The architectural way to determine + * this is MMUCFG[LPIDSIZE]. + */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_ehv_ivors +1: + mtlr r4 + blr + +_GLOBAL(__setup_cpu_e5500) + mflr r5 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_base_ivors + bl setup_perfmon_ivor + bl setup_doorbell_ivors + /* + * We only want to touch IVOR38-41 if we're running on hardware + * that supports category E.HV. The architectural way to determine + * this is MMUCFG[LPIDSIZE]. + */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_ehv_ivors + b 2f +1: + ld r10,CPU_SPEC_FEATURES(r4) + LOAD_REG_IMMEDIATE(r9,CPU_FTR_EMB_HV) + andc r10,r10,r9 + std r10,CPU_SPEC_FEATURES(r4) +2: + mtlr r5 + blr +#endif + +/* flush L1 data cache, it can apply to e500v2, e500mc and e5500 */ +_GLOBAL(flush_dcache_L1) + mfmsr r10 + wrteei 0 + + mfspr r3,SPRN_L1CFG0 + rlwinm r5,r3,9,3 /* Extract cache block size */ + twlgti r5,1 /* Only 32 and 64 byte cache blocks + * are currently defined. + */ + li r4,32 + subfic r6,r5,2 /* r6 = log2(1KiB / cache block size) - + * log2(number of ways) + */ + slw r5,r4,r5 /* r5 = cache block size */ + + rlwinm r7,r3,0,0xff /* Extract number of KiB in the cache */ + mulli r7,r7,13 /* An 8-way cache will require 13 + * loads per set. + */ + slw r7,r7,r6 + + /* save off HID0 and set DCFA */ + mfspr r8,SPRN_HID0 + ori r9,r8,HID0_DCFA@l + mtspr SPRN_HID0,r9 + isync + + LOAD_REG_IMMEDIATE(r6, KERNELBASE) + mr r4, r6 + mtctr r7 + +1: lwz r3,0(r4) /* Load... */ + add r4,r4,r5 + bdnz 1b + + msync + mr r4, r6 + mtctr r7 + +1: dcbf 0,r4 /* ...and flush. */ + add r4,r4,r5 + bdnz 1b + + /* restore HID0 */ + mtspr SPRN_HID0,r8 + isync + + wrtee r10 + + blr + +has_L2_cache: + /* skip L2 cache on P2040/P2040E as they have no L2 cache */ + mfspr r3, SPRN_SVR + /* shift right by 8 bits and clear E bit of SVR */ + rlwinm r4, r3, 24, ~0x800 + + lis r3, SVR_P2040@h + ori r3, r3, SVR_P2040@l + cmpw r4, r3 + beq 1f + + li r3, 1 + blr +1: + li r3, 0 + blr + +/* flush backside L2 cache */ +flush_backside_L2_cache: + mflr r10 + bl has_L2_cache + mtlr r10 + cmpwi r3, 0 + beq 2f + + /* Flush the L2 cache */ + mfspr r3, SPRN_L2CSR0 + ori r3, r3, L2CSR0_L2FL@l + msync + isync + mtspr SPRN_L2CSR0,r3 + isync + + /* check if it is complete */ +1: mfspr r3,SPRN_L2CSR0 + andi. r3, r3, L2CSR0_L2FL@l + bne 1b +2: + blr + +_GLOBAL(cpu_down_flush_e500v2) + mflr r0 + bl flush_dcache_L1 + mtlr r0 + blr + +_GLOBAL(cpu_down_flush_e500mc) +_GLOBAL(cpu_down_flush_e5500) + mflr r0 + bl flush_dcache_L1 + bl flush_backside_L2_cache + mtlr r0 + blr + +/* L1 Data Cache of e6500 contains no modified data, no flush is required */ +_GLOBAL(cpu_down_flush_e6500) + blr diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S deleted file mode 100644 index 058336079069..000000000000 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ /dev/null @@ -1,333 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file contains low level CPU setup functions. - * Kumar Gala - * Copyright 2009 Freescale Semiconductor, Inc. - * - * Based on cpu_setup_6xx code by - * Benjamin Herrenschmidt - */ - -#include -#include -#include -#include -#include -#include -#include - -_GLOBAL(__e500_icache_setup) - mfspr r0, SPRN_L1CSR1 - andi. r3, r0, L1CSR1_ICE - bnelr /* Already enabled */ - oris r0, r0, L1CSR1_CPE@h - ori r0, r0, (L1CSR1_ICFI | L1CSR1_ICLFR | L1CSR1_ICE) - mtspr SPRN_L1CSR1, r0 /* Enable I-Cache */ - isync - blr - -_GLOBAL(__e500_dcache_setup) - mfspr r0, SPRN_L1CSR0 - andi. r3, r0, L1CSR0_DCE - bnelr /* Already enabled */ - msync - isync - li r0, 0 - mtspr SPRN_L1CSR0, r0 /* Disable */ - msync - isync - li r0, (L1CSR0_DCFI | L1CSR0_CLFC) - mtspr SPRN_L1CSR0, r0 /* Invalidate */ - isync -1: mfspr r0, SPRN_L1CSR0 - andi. r3, r0, L1CSR0_CLFC - bne+ 1b /* Wait for lock bits reset */ - oris r0, r0, L1CSR0_CPE@h - ori r0, r0, L1CSR0_DCE - msync - isync - mtspr SPRN_L1CSR0, r0 /* Enable */ - isync - blr - -/* - * FIXME - we haven't yet done testing to determine a reasonable default - * value for PW20_WAIT_IDLE_BIT. - */ -#define PW20_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ -_GLOBAL(setup_pw20_idle) - mfspr r3, SPRN_PWRMGTCR0 - - /* Set PW20_WAIT bit, enable pw20 state*/ - ori r3, r3, PWRMGTCR0_PW20_WAIT - li r11, PW20_WAIT_IDLE_BIT - - /* Set Automatic PW20 Core Idle Count */ - rlwimi r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT - - mtspr SPRN_PWRMGTCR0, r3 - - blr - -/* - * FIXME - we haven't yet done testing to determine a reasonable default - * value for AV_WAIT_IDLE_BIT. - */ -#define AV_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ -_GLOBAL(setup_altivec_idle) - mfspr r3, SPRN_PWRMGTCR0 - - /* Enable Altivec Idle */ - oris r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h - li r11, AV_WAIT_IDLE_BIT - - /* Set Automatic AltiVec Idle Count */ - rlwimi r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT - - mtspr SPRN_PWRMGTCR0, r3 - - blr - -#ifdef CONFIG_PPC_E500MC -_GLOBAL(__setup_cpu_e6500) - mflr r6 -#ifdef CONFIG_PPC64 - bl setup_altivec_ivors - /* Touch IVOR42 only if the CPU supports E.HV category */ - mfspr r10,SPRN_MMUCFG - rlwinm. r10,r10,0,MMUCFG_LPIDSIZE - beq 1f - bl setup_lrat_ivor -1: -#endif - bl setup_pw20_idle - bl setup_altivec_idle - bl __setup_cpu_e5500 - mtlr r6 - blr -#endif /* CONFIG_PPC_E500MC */ - -#ifdef CONFIG_PPC32 -#ifdef CONFIG_PPC_E500 -#ifndef CONFIG_PPC_E500MC -_GLOBAL(__setup_cpu_e500v1) -_GLOBAL(__setup_cpu_e500v2) - mflr r4 - bl __e500_icache_setup - bl __e500_dcache_setup - bl __setup_e500_ivors -#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI) - /* Ensure that RFXE is set */ - mfspr r3,SPRN_HID1 - oris r3,r3,HID1_RFXE@h - mtspr SPRN_HID1,r3 -#endif - mtlr r4 - blr -#else /* CONFIG_PPC_E500MC */ -_GLOBAL(__setup_cpu_e500mc) -_GLOBAL(__setup_cpu_e5500) - mflr r5 - bl __e500_icache_setup - bl __e500_dcache_setup - bl __setup_e500mc_ivors - /* - * We only want to touch IVOR38-41 if we're running on hardware - * that supports category E.HV. The architectural way to determine - * this is MMUCFG[LPIDSIZE]. - */ - mfspr r3, SPRN_MMUCFG - rlwinm. r3, r3, 0, MMUCFG_LPIDSIZE - beq 1f - bl __setup_ehv_ivors - b 2f -1: - lwz r3, CPU_SPEC_FEATURES(r4) - /* We need this check as cpu_setup is also called for - * the secondary cores. So, if we have already cleared - * the feature on the primary core, avoid doing it on the - * secondary core. - */ - andi. r6, r3, CPU_FTR_EMB_HV - beq 2f - rlwinm r3, r3, 0, ~CPU_FTR_EMB_HV - stw r3, CPU_SPEC_FEATURES(r4) -2: - mtlr r5 - blr -#endif /* CONFIG_PPC_E500MC */ -#endif /* CONFIG_PPC_E500 */ -#endif /* CONFIG_PPC32 */ - -#ifdef CONFIG_PPC_BOOK3E_64 -_GLOBAL(__restore_cpu_e6500) - mflr r5 - bl setup_altivec_ivors - /* Touch IVOR42 only if the CPU supports E.HV category */ - mfspr r10,SPRN_MMUCFG - rlwinm. r10,r10,0,MMUCFG_LPIDSIZE - beq 1f - bl setup_lrat_ivor -1: - bl setup_pw20_idle - bl setup_altivec_idle - bl __restore_cpu_e5500 - mtlr r5 - blr - -_GLOBAL(__restore_cpu_e5500) - mflr r4 - bl __e500_icache_setup - bl __e500_dcache_setup - bl __setup_base_ivors - bl setup_perfmon_ivor - bl setup_doorbell_ivors - /* - * We only want to touch IVOR38-41 if we're running on hardware - * that supports category E.HV. The architectural way to determine - * this is MMUCFG[LPIDSIZE]. - */ - mfspr r10,SPRN_MMUCFG - rlwinm. r10,r10,0,MMUCFG_LPIDSIZE - beq 1f - bl setup_ehv_ivors -1: - mtlr r4 - blr - -_GLOBAL(__setup_cpu_e5500) - mflr r5 - bl __e500_icache_setup - bl __e500_dcache_setup - bl __setup_base_ivors - bl setup_perfmon_ivor - bl setup_doorbell_ivors - /* - * We only want to touch IVOR38-41 if we're running on hardware - * that supports category E.HV. The architectural way to determine - * this is MMUCFG[LPIDSIZE]. - */ - mfspr r10,SPRN_MMUCFG - rlwinm. r10,r10,0,MMUCFG_LPIDSIZE - beq 1f - bl setup_ehv_ivors - b 2f -1: - ld r10,CPU_SPEC_FEATURES(r4) - LOAD_REG_IMMEDIATE(r9,CPU_FTR_EMB_HV) - andc r10,r10,r9 - std r10,CPU_SPEC_FEATURES(r4) -2: - mtlr r5 - blr -#endif - -/* flush L1 data cache, it can apply to e500v2, e500mc and e5500 */ -_GLOBAL(flush_dcache_L1) - mfmsr r10 - wrteei 0 - - mfspr r3,SPRN_L1CFG0 - rlwinm r5,r3,9,3 /* Extract cache block size */ - twlgti r5,1 /* Only 32 and 64 byte cache blocks - * are currently defined. - */ - li r4,32 - subfic r6,r5,2 /* r6 = log2(1KiB / cache block size) - - * log2(number of ways) - */ - slw r5,r4,r5 /* r5 = cache block size */ - - rlwinm r7,r3,0,0xff /* Extract number of KiB in the cache */ - mulli r7,r7,13 /* An 8-way cache will require 13 - * loads per set. - */ - slw r7,r7,r6 - - /* save off HID0 and set DCFA */ - mfspr r8,SPRN_HID0 - ori r9,r8,HID0_DCFA@l - mtspr SPRN_HID0,r9 - isync - - LOAD_REG_IMMEDIATE(r6, KERNELBASE) - mr r4, r6 - mtctr r7 - -1: lwz r3,0(r4) /* Load... */ - add r4,r4,r5 - bdnz 1b - - msync - mr r4, r6 - mtctr r7 - -1: dcbf 0,r4 /* ...and flush. */ - add r4,r4,r5 - bdnz 1b - - /* restore HID0 */ - mtspr SPRN_HID0,r8 - isync - - wrtee r10 - - blr - -has_L2_cache: - /* skip L2 cache on P2040/P2040E as they have no L2 cache */ - mfspr r3, SPRN_SVR - /* shift right by 8 bits and clear E bit of SVR */ - rlwinm r4, r3, 24, ~0x800 - - lis r3, SVR_P2040@h - ori r3, r3, SVR_P2040@l - cmpw r4, r3 - beq 1f - - li r3, 1 - blr -1: - li r3, 0 - blr - -/* flush backside L2 cache */ -flush_backside_L2_cache: - mflr r10 - bl has_L2_cache - mtlr r10 - cmpwi r3, 0 - beq 2f - - /* Flush the L2 cache */ - mfspr r3, SPRN_L2CSR0 - ori r3, r3, L2CSR0_L2FL@l - msync - isync - mtspr SPRN_L2CSR0,r3 - isync - - /* check if it is complete */ -1: mfspr r3,SPRN_L2CSR0 - andi. r3, r3, L2CSR0_L2FL@l - bne 1b -2: - blr - -_GLOBAL(cpu_down_flush_e500v2) - mflr r0 - bl flush_dcache_L1 - mtlr r0 - blr - -_GLOBAL(cpu_down_flush_e500mc) -_GLOBAL(cpu_down_flush_e5500) - mflr r0 - bl flush_dcache_L1 - bl flush_backside_L2_cache - mtlr r0 - blr - -/* L1 Data Cache of e6500 contains no modified data, no flush is required */ -_GLOBAL(cpu_down_flush_e6500) - blr diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index a2f82ced6e4a..1047dc053b47 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -34,7 +34,7 @@ */ #define THREAD_NORMSAVE(offset) (THREAD_NORMSAVES + (offset * 4)) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 #define BOOKE_CLEAR_BTB(reg) \ START_BTB_FLUSH_SECTION \ BTB_FLUSH(reg) \ diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 4b4ba3364665..a2d3abb48075 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -230,7 +230,7 @@ _ASM_NOKPROBE_SYMBOL(system_call_common) std r0,GPR0(r1) std r10,GPR1(r1) std r2,GPR2(r1) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 START_BTB_FLUSH_SECTION BTB_FLUSH(r10) END_BTB_FLUSH_SECTION diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index b562a1d2c750..206475e3e0b4 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -35,7 +35,7 @@ static enum branch_cache_flush_type link_stack_flush_type = BRANCH_CACHE_FLUSH_N bool barrier_nospec_enabled; static bool no_nospec; static bool btb_flush_enabled; -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) +#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64) static bool no_spectrev2; #endif @@ -122,7 +122,7 @@ static __init int security_feature_debugfs_init(void) device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) +#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64) static int __init handle_nospectre_v2(char *p) { no_spectrev2 = true; @@ -130,9 +130,9 @@ static int __init handle_nospectre_v2(char *p) return 0; } early_param("nospectre_v2", handle_nospectre_v2); -#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_E500 || CONFIG_PPC_BOOK3S_64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 void __init setup_spectre_v2(void) { if (no_spectrev2 || cpu_mitigations_off()) @@ -140,7 +140,7 @@ void __init setup_spectre_v2(void) else btb_flush_enabled = true; } -#endif /* CONFIG_PPC_FSL_BOOK3E */ +#endif /* CONFIG_PPC_E500 */ #ifdef CONFIG_PPC_BOOK3S_64 ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 169703fead57..11ded19186b9 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -708,7 +708,7 @@ static struct task_struct *current_set[NR_CPUS]; static void smp_store_cpu_info(int id) { per_cpu(cpu_pvr, id) = mfspr(SPRN_PVR); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 per_cpu(next_tlbcam_idx, id) = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; #endif diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 3a10cda9c05e..ef9a61718940 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -228,7 +228,7 @@ static void __init sysfs_create_dscr_default(void) } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 #define MAX_BIT 63 static u64 pw20_wt; @@ -907,7 +907,7 @@ static int register_cpu_online(unsigned int cpu) device_create_file(s, &dev_attr_tscr); #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { device_create_file(s, &dev_attr_pw20_state); device_create_file(s, &dev_attr_pw20_wait_time); @@ -1003,7 +1003,7 @@ static int unregister_cpu_online(unsigned int cpu) device_remove_file(s, &dev_attr_tscr); #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { device_remove_file(s, &dev_attr_pw20_state); device_remove_file(s, &dev_attr_pw20_wait_time); diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index b60d81acccfc..c025c83dfdc3 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -239,7 +239,7 @@ SECTIONS } #endif /* CONFIG_PPC_BARRIER_NOSPEC */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 . = ALIGN(8); __spec_btb_flush_fixup : AT(ADDR(__spec_btb_flush_fixup) - LOAD_OFFSET) { __start__btb_flush_fixup = .; diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 993d3f31832a..31f40f544de5 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -550,7 +550,7 @@ void do_barrier_nospec_fixups(bool enable) } #endif /* CONFIG_PPC_BARRIER_NOSPEC */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end) { unsigned int instr[2], *dest; @@ -602,7 +602,7 @@ void __init do_btb_flush_fixups(void) for (; start < end; start += 2) patch_btb_flush_section(start); } -#endif /* CONFIG_PPC_FSL_BOOK3E */ +#endif /* CONFIG_PPC_E500 */ void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index bc84a594ca62..8c3ea5300ac3 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -623,7 +623,7 @@ static int __init hugetlbpage_init(void) if (pdshift > shift) { if (!IS_ENABLED(CONFIG_PPC_8xx)) pgtable_cache_add(pdshift - shift); - } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || + } else if (IS_ENABLED(CONFIG_PPC_E500) || IS_ENABLED(CONFIG_PPC_8xx)) { pgtable_cache_add(PTE_T_ORDER); } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 6ddbd6cb3a2a..84d171953ba4 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -308,7 +308,7 @@ void __init mem_init(void) } #endif /* CONFIG_HIGHMEM */ -#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP) +#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP) /* * If smp is enabled, next_tlbcam_idx is initialized in the cpu up * functions.... do it here for the non-smp case. diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 341c2e0c71d2..bd9784f77f2e 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -111,7 +111,7 @@ void MMU_init_hw_patch(void); unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init); #ifdef CONFIG_PPC32 @@ -157,7 +157,7 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500) void mmu_mark_initmem_nx(void); void mmu_mark_rodata_ro(void); #else diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index b467a25ee155..f3894e79d5f7 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -7,13 +7,13 @@ obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o obj-$(CONFIG_40x) += 40x.o obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_book3e.o +obj-$(CONFIG_PPC_E500) += e500.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o +obj-$(CONFIG_PPC_E500) += e500_hugetlbpage.o endif # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb.o := n -KCOV_INSTRUMENT_fsl_book3e.o := n +KCOV_INSTRUMENT_e500.o := n diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c deleted file mode 100644 index c7d4b317a823..000000000000 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ /dev/null @@ -1,190 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC Huge TLB Page Support for Book3E MMU - * - * Copyright (C) 2009 David Gibson, IBM Corporation. - * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor - * - */ -#include -#include - -#include - -#ifdef CONFIG_PPC64 -#include - -static inline int tlb1_next(void) -{ - struct paca_struct *paca = get_paca(); - struct tlb_core_data *tcd; - int this, next; - - tcd = paca->tcd_ptr; - this = tcd->esel_next; - - next = this + 1; - if (next >= tcd->esel_max) - next = tcd->esel_first; - - tcd->esel_next = next; - return this; -} - -static inline void book3e_tlb_lock(void) -{ - struct paca_struct *paca = get_paca(); - unsigned long tmp; - int token = smp_processor_id() + 1; - - /* - * Besides being unnecessary in the absence of SMT, this - * check prevents trying to do lbarx/stbcx. on e5500 which - * doesn't implement either feature. - */ - if (!cpu_has_feature(CPU_FTR_SMT)) - return; - - asm volatile("1: lbarx %0, 0, %1;" - "cmpwi %0, 0;" - "bne 2f;" - "stbcx. %2, 0, %1;" - "bne 1b;" - "b 3f;" - "2: lbzx %0, 0, %1;" - "cmpwi %0, 0;" - "bne 2b;" - "b 1b;" - "3:" - : "=&r" (tmp) - : "r" (&paca->tcd_ptr->lock), "r" (token) - : "memory"); -} - -static inline void book3e_tlb_unlock(void) -{ - struct paca_struct *paca = get_paca(); - - if (!cpu_has_feature(CPU_FTR_SMT)) - return; - - isync(); - paca->tcd_ptr->lock = 0; -} -#else -static inline int tlb1_next(void) -{ - int index, ncams; - - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - index = this_cpu_read(next_tlbcam_idx); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __this_cpu_write(next_tlbcam_idx, tlbcam_index); - else - __this_cpu_inc(next_tlbcam_idx); - - return index; -} - -static inline void book3e_tlb_lock(void) -{ -} - -static inline void book3e_tlb_unlock(void) -{ -} -#endif - -static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) -{ - int found = 0; - - mtspr(SPRN_MAS6, pid << 16); - asm volatile( - "tlbsx 0,%1\n" - "mfspr %0,0x271\n" - "srwi %0,%0,31\n" - : "=&r"(found) : "r"(ea)); - - return found; -} - -static void -book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) -{ - unsigned long mas1, mas2; - u64 mas7_3; - unsigned long psize, tsize, shift; - unsigned long flags; - struct mm_struct *mm; - int index; - - if (unlikely(is_kernel_addr(ea))) - return; - - mm = vma->vm_mm; - - psize = vma_mmu_pagesize(vma); - shift = __ilog2(psize); - tsize = shift - 10; - /* - * We can't be interrupted while we're setting up the MAS - * registers or after we've confirmed that no tlb exists. - */ - local_irq_save(flags); - - book3e_tlb_lock(); - - if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { - book3e_tlb_unlock(); - local_irq_restore(flags); - return; - } - - /* We have to use the CAM(TLB1) on FSL parts for hugepages */ - index = tlb1_next(); - mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); - - mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); - mas2 = ea & ~((1UL << shift) - 1); - mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; - mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; - if (!pte_dirty(pte)) - mas7_3 &= ~(MAS3_SW|MAS3_UW); - - mtspr(SPRN_MAS1, mas1); - mtspr(SPRN_MAS2, mas2); - - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - - asm volatile ("tlbwe"); - - book3e_tlb_unlock(); - local_irq_restore(flags); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a PTE in the linux page tables. - * - * This must always be called with the pte lock held. - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) -{ - if (is_vm_hugetlb_page(vma)) - book3e_hugetlb_preload(vma, address, *ptep); -} - -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct hstate *hstate = hstate_file(vma->vm_file); - unsigned long tsize = huge_page_shift(hstate) - 10; - - __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); -} diff --git a/arch/powerpc/mm/nohash/e500.c b/arch/powerpc/mm/nohash/e500.c new file mode 100644 index 000000000000..40a4e69ae1a9 --- /dev/null +++ b/arch/powerpc/mm/nohash/e500.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Modifications by Kumar Gala (galak@kernel.crashing.org) to support + * E500 Book E processors. + * + * Copyright 2004,2010 Freescale Semiconductor, Inc. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +unsigned int tlbcam_index; + +struct tlbcam TLBCAM[NUM_TLBCAMS]; + +static struct { + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} tlbcam_addrs[NUM_TLBCAMS]; + +#ifdef CONFIG_PPC_85xx +/* + * Return PA for this VA if it is mapped by a CAM, or 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) + return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (pa >= tlbcam_addrs[b].phys + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + +tlbcam_addrs[b].phys) + return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); + return 0; +} +#endif + +/* + * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; + * in particular size must be a power of 4 between 4k and the max supported by + * an implementation; max may further be limited by what can be represented in + * an unsigned long (for example, 32-bit implementations cannot support a 4GB + * size). + */ +static void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned long size, unsigned long flags, unsigned int pid) +{ + unsigned int tsize; + + tsize = __ilog2(size) - 10; + +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) + if ((flags & _PAGE_NO_CACHE) == 0) + flags |= _PAGE_COHERENT; +#endif + + TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); + TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); + TLBCAM[index].MAS2 = virt & PAGE_MASK; + + TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; + + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; + TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0; + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + TLBCAM[index].MAS7 = (u64)phys >> 32; + + /* Below is unlikely -- only for large user pages or similar */ + if (pte_user(__pte(flags))) { + TLBCAM[index].MAS3 |= MAS3_UR; + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0; + } else { + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0; + } + + tlbcam_addrs[index].start = virt; + tlbcam_addrs[index].limit = virt + size - 1; + tlbcam_addrs[index].phys = phys; +} + +static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) +{ + unsigned int camsize = __ilog2(ram); + unsigned int align = __ffs(virt | phys); + unsigned long max_cam; + + if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; + camsize &= ~1U; + align &= ~1U; + } else { + /* Convert (2^max) kB to (2^max) bytes */ + max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; + } + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + return 1UL << camsize; +} + +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, + unsigned long ram, int max_cam_idx, + bool dryrun, bool init) +{ + int i; + unsigned long amount_mapped = 0; + unsigned long boundary; + + if (strict_kernel_rwx_enabled()) + boundary = (unsigned long)(_sinittext - _stext); + else + boundary = ram; + + /* Calculate CAM values */ + for (i = 0; boundary && i < max_cam_idx; i++) { + unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX; + + cam_sz = calc_cam_sz(boundary, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); + + boundary -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + for (ram -= amount_mapped; ram && i < max_cam_idx; i++) { + unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL; + + cam_sz = calc_cam_sz(ram, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); + + ram -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + + if (dryrun) + return amount_mapped; + + if (init) { + loadcam_multi(0, i, max_cam_idx); + tlbcam_index = i; + } else { + loadcam_multi(0, i, 0); + WARN_ON(i > tlbcam_index); + } + +#ifdef CONFIG_PPC64 + get_paca()->tcd.esel_next = i; + get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + get_paca()->tcd.esel_first = i; +#endif + + return amount_mapped; +} + +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init) +{ + unsigned long virt = PAGE_OFFSET; + phys_addr_t phys = memstart_addr; + + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init); +} + +#ifdef CONFIG_PPC32 + +#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) +#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" +#endif + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; +} + +void flush_instruction_cache(void) +{ + unsigned long tmp; + + tmp = mfspr(SPRN_L1CSR1); + tmp |= L1CSR1_ICFI | L1CSR1_ICLFR; + mtspr(SPRN_L1CSR1, tmp); + isync(); +} + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + flush_instruction_cache(); +} + +static unsigned long __init tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + +void __init adjust_total_lowmem(void) +{ + unsigned long ram; + int i; + + /* adjust lowmem size to __max_low_memory */ + ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); + + i = switch_to_as1(); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true); + restore_to_as0(i, 0, NULL, 1); + + pr_info("Memory CAM mapping: "); + for (i = 0; i < tlbcam_index - 1; i++) + pr_cont("%lu/", tlbcam_sz(i) >> 20); + pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, + (unsigned int)((total_lowmem - __max_low_memory) >> 20)); + + memblock_set_current_limit(memstart_addr + __max_low_memory); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mmu_mark_rodata_ro(void) +{ + unsigned long remapped; + + remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false); + + WARN_ON(__max_low_memory != remapped); +} +#endif + +void mmu_mark_initmem_nx(void) +{ + /* Everything is done in mmu_mark_rodata_ro() */ +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + phys_addr_t limit = first_memblock_base + first_memblock_size; + + /* 64M mapped initially according to head_fsl_booke.S */ + memblock_set_current_limit(min_t(u64, limit, 0x04000000)); +} + +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ + unsigned long base = kernstart_virt_addr; + phys_addr_t size; + + kernstart_addr = start; + if (is_second_reloc) { + virt_phys_offset = PAGE_OFFSET - memstart_addr; + kaslr_late_init(); + return; + } + + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. Before we get the real memstart_addr, + * We will compute the virt_phys_offset like this: + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0x3ffffff) + + * (kernstart_addr & 0x3ffffff) + * When we relocate, we have : + * + * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - + * (kernstart_addr & ~0x3ffffff) + * + */ + start &= ~0x3ffffff; + base &= ~0x3ffffff; + virt_phys_offset = base - start; + early_get_first_memblock_info(__va(dt_ptr), &size); + /* + * We now get the memstart_addr, then we should check if this + * address is the same as what the PAGE_OFFSET map to now. If + * not we have to change the map of PAGE_OFFSET to memstart_addr + * and do a second relocation. + */ + if (start != memstart_addr) { + int n; + long offset = start - memstart_addr; + + is_second_reloc = 1; + n = switch_to_as1(); + /* map a 64M area for the second relocation */ + if (memstart_addr > start) + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, + false, true); + else + map_mem_in_cams_addr(start, PAGE_OFFSET + offset, + 0x4000000, CONFIG_LOWMEM_CAM_NUM, + false, true); + restore_to_as0(n, offset, __va(dt_ptr), 1); + /* We should never reach here */ + panic("Relocation error"); + } + + kaslr_early_init(__va(dt_ptr), size); +} +#endif +#endif diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c new file mode 100644 index 000000000000..c7d4b317a823 --- /dev/null +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include +#include + +#include + +#ifdef CONFIG_PPC64 +#include + +static inline int tlb1_next(void) +{ + struct paca_struct *paca = get_paca(); + struct tlb_core_data *tcd; + int this, next; + + tcd = paca->tcd_ptr; + this = tcd->esel_next; + + next = this + 1; + if (next >= tcd->esel_max) + next = tcd->esel_first; + + tcd->esel_next = next; + return this; +} + +static inline void book3e_tlb_lock(void) +{ + struct paca_struct *paca = get_paca(); + unsigned long tmp; + int token = smp_processor_id() + 1; + + /* + * Besides being unnecessary in the absence of SMT, this + * check prevents trying to do lbarx/stbcx. on e5500 which + * doesn't implement either feature. + */ + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + + asm volatile("1: lbarx %0, 0, %1;" + "cmpwi %0, 0;" + "bne 2f;" + "stbcx. %2, 0, %1;" + "bne 1b;" + "b 3f;" + "2: lbzx %0, 0, %1;" + "cmpwi %0, 0;" + "bne 2b;" + "b 1b;" + "3:" + : "=&r" (tmp) + : "r" (&paca->tcd_ptr->lock), "r" (token) + : "memory"); +} + +static inline void book3e_tlb_unlock(void) +{ + struct paca_struct *paca = get_paca(); + + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + + isync(); + paca->tcd_ptr->lock = 0; +} +#else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} + +static inline void book3e_tlb_lock(void) +{ +} + +static inline void book3e_tlb_unlock(void) +{ +} +#endif + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + + return found; +} + +static void +book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + struct mm_struct *mm; + int index; + + if (unlikely(is_kernel_addr(ea))) + return; + + mm = vma->vm_mm; + + psize = vma_mmu_pagesize(vma); + shift = __ilog2(psize); + tsize = shift - 10; + /* + * We can't be interrupted while we're setting up the MAS + * registers or after we've confirmed that no tlb exists. + */ + local_irq_save(flags); + + book3e_tlb_lock(); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + book3e_tlb_unlock(); + local_irq_restore(flags); + return; + } + + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = tlb1_next(); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); + + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + + asm volatile ("tlbwe"); + + book3e_tlb_unlock(); + local_irq_restore(flags); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +{ + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); +} diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c deleted file mode 100644 index 40a4e69ae1a9..000000000000 --- a/arch/powerpc/mm/nohash/fsl_book3e.c +++ /dev/null @@ -1,375 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Modifications by Kumar Gala (galak@kernel.crashing.org) to support - * E500 Book E processors. - * - * Copyright 2004,2010 Freescale Semiconductor, Inc. - * - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -unsigned int tlbcam_index; - -struct tlbcam TLBCAM[NUM_TLBCAMS]; - -static struct { - unsigned long start; - unsigned long limit; - phys_addr_t phys; -} tlbcam_addrs[NUM_TLBCAMS]; - -#ifdef CONFIG_PPC_85xx -/* - * Return PA for this VA if it is mapped by a CAM, or 0 - */ -phys_addr_t v_block_mapped(unsigned long va) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) - return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); - return 0; -} - -/* - * Return VA for a given PA or 0 if not mapped - */ -unsigned long p_block_mapped(phys_addr_t pa) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (pa >= tlbcam_addrs[b].phys - && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) - +tlbcam_addrs[b].phys) - return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); - return 0; -} -#endif - -/* - * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; - * in particular size must be a power of 4 between 4k and the max supported by - * an implementation; max may further be limited by what can be represented in - * an unsigned long (for example, 32-bit implementations cannot support a 4GB - * size). - */ -static void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned long size, unsigned long flags, unsigned int pid) -{ - unsigned int tsize; - - tsize = __ilog2(size) - 10; - -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) - if ((flags & _PAGE_NO_CACHE) == 0) - flags |= _PAGE_COHERENT; -#endif - - TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); - TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); - TLBCAM[index].MAS2 = virt & PAGE_MASK; - - TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - - TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0; - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - TLBCAM[index].MAS7 = (u64)phys >> 32; - - /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(__pte(flags))) { - TLBCAM[index].MAS3 |= MAS3_UR; - TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0; - } else { - TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0; - } - - tlbcam_addrs[index].start = virt; - tlbcam_addrs[index].limit = virt + size - 1; - tlbcam_addrs[index].phys = phys; -} - -static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys) -{ - unsigned int camsize = __ilog2(ram); - unsigned int align = __ffs(virt | phys); - unsigned long max_cam; - - if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - /* Convert (4^max) kB to (2^max) bytes */ - max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; - camsize &= ~1U; - align &= ~1U; - } else { - /* Convert (2^max) kB to (2^max) bytes */ - max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; - } - - if (camsize > align) - camsize = align; - if (camsize > max_cam) - camsize = max_cam; - - return 1UL << camsize; -} - -static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, - unsigned long ram, int max_cam_idx, - bool dryrun, bool init) -{ - int i; - unsigned long amount_mapped = 0; - unsigned long boundary; - - if (strict_kernel_rwx_enabled()) - boundary = (unsigned long)(_sinittext - _stext); - else - boundary = ram; - - /* Calculate CAM values */ - for (i = 0; boundary && i < max_cam_idx; i++) { - unsigned long cam_sz; - pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX; - - cam_sz = calc_cam_sz(boundary, virt, phys); - if (!dryrun) - settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); - - boundary -= cam_sz; - amount_mapped += cam_sz; - virt += cam_sz; - phys += cam_sz; - } - for (ram -= amount_mapped; ram && i < max_cam_idx; i++) { - unsigned long cam_sz; - pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL; - - cam_sz = calc_cam_sz(ram, virt, phys); - if (!dryrun) - settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); - - ram -= cam_sz; - amount_mapped += cam_sz; - virt += cam_sz; - phys += cam_sz; - } - - if (dryrun) - return amount_mapped; - - if (init) { - loadcam_multi(0, i, max_cam_idx); - tlbcam_index = i; - } else { - loadcam_multi(0, i, 0); - WARN_ON(i > tlbcam_index); - } - -#ifdef CONFIG_PPC64 - get_paca()->tcd.esel_next = i; - get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - get_paca()->tcd.esel_first = i; -#endif - - return amount_mapped; -} - -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init) -{ - unsigned long virt = PAGE_OFFSET; - phys_addr_t phys = memstart_addr; - - return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init); -} - -#ifdef CONFIG_PPC32 - -#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) -#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" -#endif - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; -} - -void flush_instruction_cache(void) -{ - unsigned long tmp; - - tmp = mfspr(SPRN_L1CSR1); - tmp |= L1CSR1_ICFI | L1CSR1_ICLFR; - mtspr(SPRN_L1CSR1, tmp); - isync(); -} - -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. - */ -void __init MMU_init_hw(void) -{ - flush_instruction_cache(); -} - -static unsigned long __init tlbcam_sz(int idx) -{ - return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; -} - -void __init adjust_total_lowmem(void) -{ - unsigned long ram; - int i; - - /* adjust lowmem size to __max_low_memory */ - ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); - - i = switch_to_as1(); - __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true); - restore_to_as0(i, 0, NULL, 1); - - pr_info("Memory CAM mapping: "); - for (i = 0; i < tlbcam_index - 1; i++) - pr_cont("%lu/", tlbcam_sz(i) >> 20); - pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, - (unsigned int)((total_lowmem - __max_low_memory) >> 20)); - - memblock_set_current_limit(memstart_addr + __max_low_memory); -} - -#ifdef CONFIG_STRICT_KERNEL_RWX -void mmu_mark_rodata_ro(void) -{ - unsigned long remapped; - - remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false); - - WARN_ON(__max_low_memory != remapped); -} -#endif - -void mmu_mark_initmem_nx(void) -{ - /* Everything is done in mmu_mark_rodata_ro() */ -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - phys_addr_t limit = first_memblock_base + first_memblock_size; - - /* 64M mapped initially according to head_fsl_booke.S */ - memblock_set_current_limit(min_t(u64, limit, 0x04000000)); -} - -#ifdef CONFIG_RELOCATABLE -int __initdata is_second_reloc; -notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) -{ - unsigned long base = kernstart_virt_addr; - phys_addr_t size; - - kernstart_addr = start; - if (is_second_reloc) { - virt_phys_offset = PAGE_OFFSET - memstart_addr; - kaslr_late_init(); - return; - } - - /* - * Relocatable kernel support based on processing of dynamic - * relocation entries. Before we get the real memstart_addr, - * We will compute the virt_phys_offset like this: - * virt_phys_offset = stext.run - kernstart_addr - * - * stext.run = (KERNELBASE & ~0x3ffffff) + - * (kernstart_addr & 0x3ffffff) - * When we relocate, we have : - * - * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) - * - * hence: - * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - - * (kernstart_addr & ~0x3ffffff) - * - */ - start &= ~0x3ffffff; - base &= ~0x3ffffff; - virt_phys_offset = base - start; - early_get_first_memblock_info(__va(dt_ptr), &size); - /* - * We now get the memstart_addr, then we should check if this - * address is the same as what the PAGE_OFFSET map to now. If - * not we have to change the map of PAGE_OFFSET to memstart_addr - * and do a second relocation. - */ - if (start != memstart_addr) { - int n; - long offset = start - memstart_addr; - - is_second_reloc = 1; - n = switch_to_as1(); - /* map a 64M area for the second relocation */ - if (memstart_addr > start) - map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, - false, true); - else - map_mem_in_cams_addr(start, PAGE_OFFSET + offset, - 0x4000000, CONFIG_LOWMEM_CAM_NUM, - false, true); - restore_to_as0(n, offset, __va(dt_ptr), 1); - /* We should never reach here */ - panic("Relocation error"); - } - - kaslr_early_init(__va(dt_ptr), size); -} -#endif -#endif diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index f21896ebdc5a..fcb1e5ae5c55 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -50,7 +50,7 @@ * indirect page table entries. */ #if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -166,7 +166,7 @@ int extlb_level_exc; #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 /* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ DEFINE_PER_CPU(int, next_tlbcam_idx); EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); @@ -441,7 +441,7 @@ static void __init setup_page_sizes(void) unsigned int eptcfg; int i, psize; -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 unsigned int mmucfg = mfspr(SPRN_MMUCFG); int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); @@ -584,7 +584,7 @@ static void __init setup_mmu_htw(void) patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); break; -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 case PPC_HTW_E6500: extlb_level_exc = EX_TLB_SIZE; patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); @@ -627,7 +627,7 @@ static void early_init_this_mmu(void) } mtspr(SPRN_MAS4, mas4); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned int num_cams; bool map = true; @@ -680,7 +680,7 @@ static void __init early_init_mmu_global(void) /* Look for HW tablewalk support */ setup_mmu_htw(); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { if (book3e_htw_mode == PPC_HTW_NONE) { extlb_level_exc = EX_TLB_SIZE; @@ -701,7 +701,7 @@ static void __init early_init_mmu_global(void) static void __init early_mmu_set_memory_limit(void) { -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { /* * Limit memory so we dont have linear faults. @@ -750,7 +750,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, * We crop it to the size of the first MEMBLOCK to * avoid going over total available memory just in case... */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned long linear_sz; unsigned int num_cams; diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index 6914bc8e4ead..e1199608ff4d 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -364,7 +364,7 @@ _GLOBAL(_tlbivax_bcast) #error Unsupported processor type ! #endif -#if defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_E500) /* * extern void loadcam_entry(unsigned int index) * diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 5b065186ace5..32c60ad8f45d 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -107,7 +107,6 @@ config PPC_BOOK3S_64 config PPC_BOOK3E_64 bool "Embedded processors" - select PPC_FSL_BOOK3E select PPC_E500 select PPC_E500MC select PPC_FPU # Make it a choice ? @@ -259,8 +258,11 @@ config PPC_BOOK3S config PPC_E500 select FSL_EMB_PERFMON - select PPC_FSL_BOOK3E bool + select ARCH_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64 + select PPC_SMP_MUXED_IPI + select PPC_DOORBELL + select PPC_KUEP config PPC_E500MC bool "e500mc Support" @@ -320,16 +322,6 @@ config BOOKE_OR_40x depends on BOOKE || 40x default y -# this is for common code between PPC32 & PPC64 FSL BOOKE -config PPC_FSL_BOOK3E - bool - select ARCH_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64 - imply FSL_EMB_PERFMON - select PPC_SMP_MUXED_IPI - select PPC_DOORBELL - select PPC_KUEP - default y if PPC_85xx - config PTE_64BIT bool depends on 44x || PPC_E500 || PPC_86xx -- cgit v1.2.3 From aa5f59df201dd350f7c291c845ac8b62c0d0edd5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Sep 2022 19:01:39 +0200 Subject: powerpc: Remove CONFIG_PPC_BOOK3E_MMU CONFIG_PPC_BOOK3E_MMU is redundant with CONFIG_PPC_E500. Remove it. Also rename mmu-book3e.h to mmu-e500.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c5549cd59a131204ff94ab909cad2e2dad4ddf2f.1663606876.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/mmu-book3e.h | 324 --------------------------- arch/powerpc/include/asm/nohash/mmu-e500.h | 324 +++++++++++++++++++++++++++ arch/powerpc/include/asm/nohash/mmu.h | 4 +- arch/powerpc/kernel/cpu_setup_e500.S | 2 +- arch/powerpc/kernel/entry_32.S | 2 +- arch/powerpc/kernel/head_booke.h | 4 +- arch/powerpc/kernel/kvm.c | 8 +- arch/powerpc/kvm/e500.h | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 +- arch/powerpc/mm/ptdump/Makefile | 2 +- arch/powerpc/platforms/Kconfig.cputype | 4 - 11 files changed, 338 insertions(+), 342 deletions(-) delete mode 100644 arch/powerpc/include/asm/nohash/mmu-book3e.h create mode 100644 arch/powerpc/include/asm/nohash/mmu-e500.h (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/nohash/mmu-book3e.h b/arch/powerpc/include/asm/nohash/mmu-book3e.h deleted file mode 100644 index e43a418d3ccd..000000000000 --- a/arch/powerpc/include/asm/nohash/mmu-book3e.h +++ /dev/null @@ -1,324 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_MMU_BOOK3E_H_ -#define _ASM_POWERPC_MMU_BOOK3E_H_ -/* - * Freescale Book-E/Book-3e (ISA 2.06+) MMU support - */ - -/* Book-3e defined page sizes */ -#define BOOK3E_PAGESZ_1K 0 -#define BOOK3E_PAGESZ_2K 1 -#define BOOK3E_PAGESZ_4K 2 -#define BOOK3E_PAGESZ_8K 3 -#define BOOK3E_PAGESZ_16K 4 -#define BOOK3E_PAGESZ_32K 5 -#define BOOK3E_PAGESZ_64K 6 -#define BOOK3E_PAGESZ_128K 7 -#define BOOK3E_PAGESZ_256K 8 -#define BOOK3E_PAGESZ_512K 9 -#define BOOK3E_PAGESZ_1M 10 -#define BOOK3E_PAGESZ_2M 11 -#define BOOK3E_PAGESZ_4M 12 -#define BOOK3E_PAGESZ_8M 13 -#define BOOK3E_PAGESZ_16M 14 -#define BOOK3E_PAGESZ_32M 15 -#define BOOK3E_PAGESZ_64M 16 -#define BOOK3E_PAGESZ_128M 17 -#define BOOK3E_PAGESZ_256M 18 -#define BOOK3E_PAGESZ_512M 19 -#define BOOK3E_PAGESZ_1GB 20 -#define BOOK3E_PAGESZ_2GB 21 -#define BOOK3E_PAGESZ_4GB 22 -#define BOOK3E_PAGESZ_8GB 23 -#define BOOK3E_PAGESZ_16GB 24 -#define BOOK3E_PAGESZ_32GB 25 -#define BOOK3E_PAGESZ_64GB 26 -#define BOOK3E_PAGESZ_128GB 27 -#define BOOK3E_PAGESZ_256GB 28 -#define BOOK3E_PAGESZ_512GB 29 -#define BOOK3E_PAGESZ_1TB 30 -#define BOOK3E_PAGESZ_2TB 31 - -/* MAS registers bit definitions */ - -#define MAS0_TLBSEL_MASK 0x30000000 -#define MAS0_TLBSEL_SHIFT 28 -#define MAS0_TLBSEL(x) (((x) << MAS0_TLBSEL_SHIFT) & MAS0_TLBSEL_MASK) -#define MAS0_GET_TLBSEL(mas0) (((mas0) & MAS0_TLBSEL_MASK) >> \ - MAS0_TLBSEL_SHIFT) -#define MAS0_ESEL_MASK 0x0FFF0000 -#define MAS0_ESEL_SHIFT 16 -#define MAS0_ESEL(x) (((x) << MAS0_ESEL_SHIFT) & MAS0_ESEL_MASK) -#define MAS0_NV(x) ((x) & 0x00000FFF) -#define MAS0_HES 0x00004000 -#define MAS0_WQ_ALLWAYS 0x00000000 -#define MAS0_WQ_COND 0x00001000 -#define MAS0_WQ_CLR_RSRV 0x00002000 - -#define MAS1_VALID 0x80000000 -#define MAS1_IPROT 0x40000000 -#define MAS1_TID(x) (((x) << 16) & 0x3FFF0000) -#define MAS1_IND 0x00002000 -#define MAS1_TS 0x00001000 -#define MAS1_TSIZE_MASK 0x00000f80 -#define MAS1_TSIZE_SHIFT 7 -#define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) -#define MAS1_GET_TSIZE(mas1) (((mas1) & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT) - -#define MAS2_EPN (~0xFFFUL) -#define MAS2_X0 0x00000040 -#define MAS2_X1 0x00000020 -#define MAS2_W 0x00000010 -#define MAS2_I 0x00000008 -#define MAS2_M 0x00000004 -#define MAS2_G 0x00000002 -#define MAS2_E 0x00000001 -#define MAS2_WIMGE_MASK 0x0000001f -#define MAS2_EPN_MASK(size) (~0 << (size + 10)) - -#define MAS3_RPN 0xFFFFF000 -#define MAS3_U0 0x00000200 -#define MAS3_U1 0x00000100 -#define MAS3_U2 0x00000080 -#define MAS3_U3 0x00000040 -#define MAS3_UX 0x00000020 -#define MAS3_SX 0x00000010 -#define MAS3_UW 0x00000008 -#define MAS3_SW 0x00000004 -#define MAS3_UR 0x00000002 -#define MAS3_SR 0x00000001 -#define MAS3_BAP_MASK 0x0000003f -#define MAS3_SPSIZE 0x0000003e -#define MAS3_SPSIZE_SHIFT 1 - -#define MAS4_TLBSEL_MASK MAS0_TLBSEL_MASK -#define MAS4_TLBSELD(x) MAS0_TLBSEL(x) -#define MAS4_INDD 0x00008000 /* Default IND */ -#define MAS4_TSIZED(x) MAS1_TSIZE(x) -#define MAS4_X0D 0x00000040 -#define MAS4_X1D 0x00000020 -#define MAS4_WD 0x00000010 -#define MAS4_ID 0x00000008 -#define MAS4_MD 0x00000004 -#define MAS4_GD 0x00000002 -#define MAS4_ED 0x00000001 -#define MAS4_WIMGED_MASK 0x0000001f /* Default WIMGE */ -#define MAS4_WIMGED_SHIFT 0 -#define MAS4_VLED MAS4_X1D /* Default VLE */ -#define MAS4_ACMD 0x000000c0 /* Default ACM */ -#define MAS4_ACMD_SHIFT 6 -#define MAS4_TSIZED_MASK 0x00000f80 /* Default TSIZE */ -#define MAS4_TSIZED_SHIFT 7 - -#define MAS5_SGS 0x80000000 - -#define MAS6_SPID0 0x3FFF0000 -#define MAS6_SPID1 0x00007FFE -#define MAS6_ISIZE(x) MAS1_TSIZE(x) -#define MAS6_SAS 0x00000001 -#define MAS6_SPID MAS6_SPID0 -#define MAS6_SIND 0x00000002 /* Indirect page */ -#define MAS6_SIND_SHIFT 1 -#define MAS6_SPID_MASK 0x3fff0000 -#define MAS6_SPID_SHIFT 16 -#define MAS6_ISIZE_MASK 0x00000f80 -#define MAS6_ISIZE_SHIFT 7 - -#define MAS7_RPN 0xFFFFFFFF - -#define MAS8_TGS 0x80000000 /* Guest space */ -#define MAS8_VF 0x40000000 /* Virtualization Fault */ -#define MAS8_TLPID 0x000000ff - -/* Bit definitions for MMUCFG */ -#define MMUCFG_MAVN 0x00000003 /* MMU Architecture Version Number */ -#define MMUCFG_MAVN_V1 0x00000000 /* v1.0 */ -#define MMUCFG_MAVN_V2 0x00000001 /* v2.0 */ -#define MMUCFG_NTLBS 0x0000000c /* Number of TLBs */ -#define MMUCFG_PIDSIZE 0x000007c0 /* PID Reg Size */ -#define MMUCFG_TWC 0x00008000 /* TLB Write Conditional (v2.0) */ -#define MMUCFG_LRAT 0x00010000 /* LRAT Supported (v2.0) */ -#define MMUCFG_RASIZE 0x00fe0000 /* Real Addr Size */ -#define MMUCFG_LPIDSIZE 0x0f000000 /* LPID Reg Size */ - -/* Bit definitions for MMUCSR0 */ -#define MMUCSR0_TLB1FI 0x00000002 /* TLB1 Flash invalidate */ -#define MMUCSR0_TLB0FI 0x00000004 /* TLB0 Flash invalidate */ -#define MMUCSR0_TLB2FI 0x00000040 /* TLB2 Flash invalidate */ -#define MMUCSR0_TLB3FI 0x00000020 /* TLB3 Flash invalidate */ -#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \ - MMUCSR0_TLB2FI | MMUCSR0_TLB3FI) -#define MMUCSR0_TLB0PS 0x00000780 /* TLB0 Page Size */ -#define MMUCSR0_TLB1PS 0x00007800 /* TLB1 Page Size */ -#define MMUCSR0_TLB2PS 0x00078000 /* TLB2 Page Size */ -#define MMUCSR0_TLB3PS 0x00780000 /* TLB3 Page Size */ - -/* MMUCFG bits */ -#define MMUCFG_MAVN_NASK 0x00000003 -#define MMUCFG_MAVN_V1_0 0x00000000 -#define MMUCFG_MAVN_V2_0 0x00000001 -#define MMUCFG_NTLB_MASK 0x0000000c -#define MMUCFG_NTLB_SHIFT 2 -#define MMUCFG_PIDSIZE_MASK 0x000007c0 -#define MMUCFG_PIDSIZE_SHIFT 6 -#define MMUCFG_TWC 0x00008000 -#define MMUCFG_LRAT 0x00010000 -#define MMUCFG_RASIZE_MASK 0x00fe0000 -#define MMUCFG_RASIZE_SHIFT 17 -#define MMUCFG_LPIDSIZE_MASK 0x0f000000 -#define MMUCFG_LPIDSIZE_SHIFT 24 - -/* TLBnCFG encoding */ -#define TLBnCFG_N_ENTRY 0x00000fff /* number of entries */ -#define TLBnCFG_HES 0x00002000 /* HW select supported */ -#define TLBnCFG_IPROT 0x00008000 /* IPROT supported */ -#define TLBnCFG_GTWE 0x00010000 /* Guest can write */ -#define TLBnCFG_IND 0x00020000 /* IND entries supported */ -#define TLBnCFG_PT 0x00040000 /* Can load from page table */ -#define TLBnCFG_MINSIZE 0x00f00000 /* Minimum Page Size (v1.0) */ -#define TLBnCFG_MINSIZE_SHIFT 20 -#define TLBnCFG_MAXSIZE 0x000f0000 /* Maximum Page Size (v1.0) */ -#define TLBnCFG_MAXSIZE_SHIFT 16 -#define TLBnCFG_ASSOC 0xff000000 /* Associativity */ -#define TLBnCFG_ASSOC_SHIFT 24 - -/* TLBnPS encoding */ -#define TLBnPS_4K 0x00000004 -#define TLBnPS_8K 0x00000008 -#define TLBnPS_16K 0x00000010 -#define TLBnPS_32K 0x00000020 -#define TLBnPS_64K 0x00000040 -#define TLBnPS_128K 0x00000080 -#define TLBnPS_256K 0x00000100 -#define TLBnPS_512K 0x00000200 -#define TLBnPS_1M 0x00000400 -#define TLBnPS_2M 0x00000800 -#define TLBnPS_4M 0x00001000 -#define TLBnPS_8M 0x00002000 -#define TLBnPS_16M 0x00004000 -#define TLBnPS_32M 0x00008000 -#define TLBnPS_64M 0x00010000 -#define TLBnPS_128M 0x00020000 -#define TLBnPS_256M 0x00040000 -#define TLBnPS_512M 0x00080000 -#define TLBnPS_1G 0x00100000 -#define TLBnPS_2G 0x00200000 -#define TLBnPS_4G 0x00400000 -#define TLBnPS_8G 0x00800000 -#define TLBnPS_16G 0x01000000 -#define TLBnPS_32G 0x02000000 -#define TLBnPS_64G 0x04000000 -#define TLBnPS_128G 0x08000000 -#define TLBnPS_256G 0x10000000 - -/* tlbilx action encoding */ -#define TLBILX_T_ALL 0 -#define TLBILX_T_TID 1 -#define TLBILX_T_FULLMATCH 3 -#define TLBILX_T_CLASS0 4 -#define TLBILX_T_CLASS1 5 -#define TLBILX_T_CLASS2 6 -#define TLBILX_T_CLASS3 7 - -/* - * The mapping only needs to be cache-coherent on SMP, except on - * Freescale e500mc derivatives where it's also needed for coherent DMA. - */ -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) -#define MAS2_M_IF_NEEDED MAS2_M -#else -#define MAS2_M_IF_NEEDED 0 -#endif - -#ifndef __ASSEMBLY__ -#include - -extern unsigned int tlbcam_index; - -typedef struct { - unsigned int id; - unsigned int active; - void __user *vdso; -} mm_context_t; - -/* Page size definitions, common between 32 and 64-bit - * - * shift : is the "PAGE_SHIFT" value for that page size - * penc : is the pte encoding mask - * - */ -struct mmu_psize_def -{ - unsigned int shift; /* number of bits */ - unsigned int enc; /* PTE encoding */ - unsigned int ind; /* Corresponding indirect page size shift */ - unsigned int flags; -#define MMU_PAGE_SIZE_DIRECT 0x1 /* Supported as a direct size */ -#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */ -}; -extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; - -static inline int shift_to_mmu_psize(unsigned int shift) -{ - int psize; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) - if (mmu_psize_defs[psize].shift == shift) - return psize; - return -1; -} - -static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) -{ - if (mmu_psize_defs[mmu_psize].shift) - return mmu_psize_defs[mmu_psize].shift; - BUG(); -} - -/* The page sizes use the same names as 64-bit hash but are - * constants - */ -#if defined(CONFIG_PPC_4K_PAGES) -#define mmu_virtual_psize MMU_PAGE_4K -#else -#error Unsupported page size -#endif - -extern int mmu_linear_psize; -extern int mmu_vmemmap_psize; - -struct tlb_core_data { - /* - * Per-core spinlock for e6500 TLB handlers (no tlbsrx.) - * Must be the first struct element. - */ - u8 lock; - - /* For software way selection, as on Freescale TLB1 */ - u8 esel_next, esel_max, esel_first; -}; - -#ifdef CONFIG_PPC64 -extern unsigned long linear_map_top; -extern int book3e_htw_mode; - -#define PPC_HTW_NONE 0 -#define PPC_HTW_IBM 1 -#define PPC_HTW_E6500 2 - -/* - * 64-bit booke platforms don't load the tlb in the tlb miss handler code. - * HUGETLB_NEED_PRELOAD handles this - it causes huge_ptep_set_access_flags to - * return 1, indicating that the tlb requires preloading. - */ -#define HUGETLB_NEED_PRELOAD - -#define mmu_cleanup_all NULL - -#define MAX_PHYSMEM_BITS 44 - -#endif - -#endif /* !__ASSEMBLY__ */ - -#endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */ diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h new file mode 100644 index 000000000000..e43a418d3ccd --- /dev/null +++ b/arch/powerpc/include/asm/nohash/mmu-e500.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_MMU_BOOK3E_H_ +#define _ASM_POWERPC_MMU_BOOK3E_H_ +/* + * Freescale Book-E/Book-3e (ISA 2.06+) MMU support + */ + +/* Book-3e defined page sizes */ +#define BOOK3E_PAGESZ_1K 0 +#define BOOK3E_PAGESZ_2K 1 +#define BOOK3E_PAGESZ_4K 2 +#define BOOK3E_PAGESZ_8K 3 +#define BOOK3E_PAGESZ_16K 4 +#define BOOK3E_PAGESZ_32K 5 +#define BOOK3E_PAGESZ_64K 6 +#define BOOK3E_PAGESZ_128K 7 +#define BOOK3E_PAGESZ_256K 8 +#define BOOK3E_PAGESZ_512K 9 +#define BOOK3E_PAGESZ_1M 10 +#define BOOK3E_PAGESZ_2M 11 +#define BOOK3E_PAGESZ_4M 12 +#define BOOK3E_PAGESZ_8M 13 +#define BOOK3E_PAGESZ_16M 14 +#define BOOK3E_PAGESZ_32M 15 +#define BOOK3E_PAGESZ_64M 16 +#define BOOK3E_PAGESZ_128M 17 +#define BOOK3E_PAGESZ_256M 18 +#define BOOK3E_PAGESZ_512M 19 +#define BOOK3E_PAGESZ_1GB 20 +#define BOOK3E_PAGESZ_2GB 21 +#define BOOK3E_PAGESZ_4GB 22 +#define BOOK3E_PAGESZ_8GB 23 +#define BOOK3E_PAGESZ_16GB 24 +#define BOOK3E_PAGESZ_32GB 25 +#define BOOK3E_PAGESZ_64GB 26 +#define BOOK3E_PAGESZ_128GB 27 +#define BOOK3E_PAGESZ_256GB 28 +#define BOOK3E_PAGESZ_512GB 29 +#define BOOK3E_PAGESZ_1TB 30 +#define BOOK3E_PAGESZ_2TB 31 + +/* MAS registers bit definitions */ + +#define MAS0_TLBSEL_MASK 0x30000000 +#define MAS0_TLBSEL_SHIFT 28 +#define MAS0_TLBSEL(x) (((x) << MAS0_TLBSEL_SHIFT) & MAS0_TLBSEL_MASK) +#define MAS0_GET_TLBSEL(mas0) (((mas0) & MAS0_TLBSEL_MASK) >> \ + MAS0_TLBSEL_SHIFT) +#define MAS0_ESEL_MASK 0x0FFF0000 +#define MAS0_ESEL_SHIFT 16 +#define MAS0_ESEL(x) (((x) << MAS0_ESEL_SHIFT) & MAS0_ESEL_MASK) +#define MAS0_NV(x) ((x) & 0x00000FFF) +#define MAS0_HES 0x00004000 +#define MAS0_WQ_ALLWAYS 0x00000000 +#define MAS0_WQ_COND 0x00001000 +#define MAS0_WQ_CLR_RSRV 0x00002000 + +#define MAS1_VALID 0x80000000 +#define MAS1_IPROT 0x40000000 +#define MAS1_TID(x) (((x) << 16) & 0x3FFF0000) +#define MAS1_IND 0x00002000 +#define MAS1_TS 0x00001000 +#define MAS1_TSIZE_MASK 0x00000f80 +#define MAS1_TSIZE_SHIFT 7 +#define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) +#define MAS1_GET_TSIZE(mas1) (((mas1) & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT) + +#define MAS2_EPN (~0xFFFUL) +#define MAS2_X0 0x00000040 +#define MAS2_X1 0x00000020 +#define MAS2_W 0x00000010 +#define MAS2_I 0x00000008 +#define MAS2_M 0x00000004 +#define MAS2_G 0x00000002 +#define MAS2_E 0x00000001 +#define MAS2_WIMGE_MASK 0x0000001f +#define MAS2_EPN_MASK(size) (~0 << (size + 10)) + +#define MAS3_RPN 0xFFFFF000 +#define MAS3_U0 0x00000200 +#define MAS3_U1 0x00000100 +#define MAS3_U2 0x00000080 +#define MAS3_U3 0x00000040 +#define MAS3_UX 0x00000020 +#define MAS3_SX 0x00000010 +#define MAS3_UW 0x00000008 +#define MAS3_SW 0x00000004 +#define MAS3_UR 0x00000002 +#define MAS3_SR 0x00000001 +#define MAS3_BAP_MASK 0x0000003f +#define MAS3_SPSIZE 0x0000003e +#define MAS3_SPSIZE_SHIFT 1 + +#define MAS4_TLBSEL_MASK MAS0_TLBSEL_MASK +#define MAS4_TLBSELD(x) MAS0_TLBSEL(x) +#define MAS4_INDD 0x00008000 /* Default IND */ +#define MAS4_TSIZED(x) MAS1_TSIZE(x) +#define MAS4_X0D 0x00000040 +#define MAS4_X1D 0x00000020 +#define MAS4_WD 0x00000010 +#define MAS4_ID 0x00000008 +#define MAS4_MD 0x00000004 +#define MAS4_GD 0x00000002 +#define MAS4_ED 0x00000001 +#define MAS4_WIMGED_MASK 0x0000001f /* Default WIMGE */ +#define MAS4_WIMGED_SHIFT 0 +#define MAS4_VLED MAS4_X1D /* Default VLE */ +#define MAS4_ACMD 0x000000c0 /* Default ACM */ +#define MAS4_ACMD_SHIFT 6 +#define MAS4_TSIZED_MASK 0x00000f80 /* Default TSIZE */ +#define MAS4_TSIZED_SHIFT 7 + +#define MAS5_SGS 0x80000000 + +#define MAS6_SPID0 0x3FFF0000 +#define MAS6_SPID1 0x00007FFE +#define MAS6_ISIZE(x) MAS1_TSIZE(x) +#define MAS6_SAS 0x00000001 +#define MAS6_SPID MAS6_SPID0 +#define MAS6_SIND 0x00000002 /* Indirect page */ +#define MAS6_SIND_SHIFT 1 +#define MAS6_SPID_MASK 0x3fff0000 +#define MAS6_SPID_SHIFT 16 +#define MAS6_ISIZE_MASK 0x00000f80 +#define MAS6_ISIZE_SHIFT 7 + +#define MAS7_RPN 0xFFFFFFFF + +#define MAS8_TGS 0x80000000 /* Guest space */ +#define MAS8_VF 0x40000000 /* Virtualization Fault */ +#define MAS8_TLPID 0x000000ff + +/* Bit definitions for MMUCFG */ +#define MMUCFG_MAVN 0x00000003 /* MMU Architecture Version Number */ +#define MMUCFG_MAVN_V1 0x00000000 /* v1.0 */ +#define MMUCFG_MAVN_V2 0x00000001 /* v2.0 */ +#define MMUCFG_NTLBS 0x0000000c /* Number of TLBs */ +#define MMUCFG_PIDSIZE 0x000007c0 /* PID Reg Size */ +#define MMUCFG_TWC 0x00008000 /* TLB Write Conditional (v2.0) */ +#define MMUCFG_LRAT 0x00010000 /* LRAT Supported (v2.0) */ +#define MMUCFG_RASIZE 0x00fe0000 /* Real Addr Size */ +#define MMUCFG_LPIDSIZE 0x0f000000 /* LPID Reg Size */ + +/* Bit definitions for MMUCSR0 */ +#define MMUCSR0_TLB1FI 0x00000002 /* TLB1 Flash invalidate */ +#define MMUCSR0_TLB0FI 0x00000004 /* TLB0 Flash invalidate */ +#define MMUCSR0_TLB2FI 0x00000040 /* TLB2 Flash invalidate */ +#define MMUCSR0_TLB3FI 0x00000020 /* TLB3 Flash invalidate */ +#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \ + MMUCSR0_TLB2FI | MMUCSR0_TLB3FI) +#define MMUCSR0_TLB0PS 0x00000780 /* TLB0 Page Size */ +#define MMUCSR0_TLB1PS 0x00007800 /* TLB1 Page Size */ +#define MMUCSR0_TLB2PS 0x00078000 /* TLB2 Page Size */ +#define MMUCSR0_TLB3PS 0x00780000 /* TLB3 Page Size */ + +/* MMUCFG bits */ +#define MMUCFG_MAVN_NASK 0x00000003 +#define MMUCFG_MAVN_V1_0 0x00000000 +#define MMUCFG_MAVN_V2_0 0x00000001 +#define MMUCFG_NTLB_MASK 0x0000000c +#define MMUCFG_NTLB_SHIFT 2 +#define MMUCFG_PIDSIZE_MASK 0x000007c0 +#define MMUCFG_PIDSIZE_SHIFT 6 +#define MMUCFG_TWC 0x00008000 +#define MMUCFG_LRAT 0x00010000 +#define MMUCFG_RASIZE_MASK 0x00fe0000 +#define MMUCFG_RASIZE_SHIFT 17 +#define MMUCFG_LPIDSIZE_MASK 0x0f000000 +#define MMUCFG_LPIDSIZE_SHIFT 24 + +/* TLBnCFG encoding */ +#define TLBnCFG_N_ENTRY 0x00000fff /* number of entries */ +#define TLBnCFG_HES 0x00002000 /* HW select supported */ +#define TLBnCFG_IPROT 0x00008000 /* IPROT supported */ +#define TLBnCFG_GTWE 0x00010000 /* Guest can write */ +#define TLBnCFG_IND 0x00020000 /* IND entries supported */ +#define TLBnCFG_PT 0x00040000 /* Can load from page table */ +#define TLBnCFG_MINSIZE 0x00f00000 /* Minimum Page Size (v1.0) */ +#define TLBnCFG_MINSIZE_SHIFT 20 +#define TLBnCFG_MAXSIZE 0x000f0000 /* Maximum Page Size (v1.0) */ +#define TLBnCFG_MAXSIZE_SHIFT 16 +#define TLBnCFG_ASSOC 0xff000000 /* Associativity */ +#define TLBnCFG_ASSOC_SHIFT 24 + +/* TLBnPS encoding */ +#define TLBnPS_4K 0x00000004 +#define TLBnPS_8K 0x00000008 +#define TLBnPS_16K 0x00000010 +#define TLBnPS_32K 0x00000020 +#define TLBnPS_64K 0x00000040 +#define TLBnPS_128K 0x00000080 +#define TLBnPS_256K 0x00000100 +#define TLBnPS_512K 0x00000200 +#define TLBnPS_1M 0x00000400 +#define TLBnPS_2M 0x00000800 +#define TLBnPS_4M 0x00001000 +#define TLBnPS_8M 0x00002000 +#define TLBnPS_16M 0x00004000 +#define TLBnPS_32M 0x00008000 +#define TLBnPS_64M 0x00010000 +#define TLBnPS_128M 0x00020000 +#define TLBnPS_256M 0x00040000 +#define TLBnPS_512M 0x00080000 +#define TLBnPS_1G 0x00100000 +#define TLBnPS_2G 0x00200000 +#define TLBnPS_4G 0x00400000 +#define TLBnPS_8G 0x00800000 +#define TLBnPS_16G 0x01000000 +#define TLBnPS_32G 0x02000000 +#define TLBnPS_64G 0x04000000 +#define TLBnPS_128G 0x08000000 +#define TLBnPS_256G 0x10000000 + +/* tlbilx action encoding */ +#define TLBILX_T_ALL 0 +#define TLBILX_T_TID 1 +#define TLBILX_T_FULLMATCH 3 +#define TLBILX_T_CLASS0 4 +#define TLBILX_T_CLASS1 5 +#define TLBILX_T_CLASS2 6 +#define TLBILX_T_CLASS3 7 + +/* + * The mapping only needs to be cache-coherent on SMP, except on + * Freescale e500mc derivatives where it's also needed for coherent DMA. + */ +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) +#define MAS2_M_IF_NEEDED MAS2_M +#else +#define MAS2_M_IF_NEEDED 0 +#endif + +#ifndef __ASSEMBLY__ +#include + +extern unsigned int tlbcam_index; + +typedef struct { + unsigned int id; + unsigned int active; + void __user *vdso; +} mm_context_t; + +/* Page size definitions, common between 32 and 64-bit + * + * shift : is the "PAGE_SHIFT" value for that page size + * penc : is the pte encoding mask + * + */ +struct mmu_psize_def +{ + unsigned int shift; /* number of bits */ + unsigned int enc; /* PTE encoding */ + unsigned int ind; /* Corresponding indirect page size shift */ + unsigned int flags; +#define MMU_PAGE_SIZE_DIRECT 0x1 /* Supported as a direct size */ +#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */ +}; +extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + +static inline int shift_to_mmu_psize(unsigned int shift) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) + if (mmu_psize_defs[psize].shift == shift) + return psize; + return -1; +} + +static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) +{ + if (mmu_psize_defs[mmu_psize].shift) + return mmu_psize_defs[mmu_psize].shift; + BUG(); +} + +/* The page sizes use the same names as 64-bit hash but are + * constants + */ +#if defined(CONFIG_PPC_4K_PAGES) +#define mmu_virtual_psize MMU_PAGE_4K +#else +#error Unsupported page size +#endif + +extern int mmu_linear_psize; +extern int mmu_vmemmap_psize; + +struct tlb_core_data { + /* + * Per-core spinlock for e6500 TLB handlers (no tlbsrx.) + * Must be the first struct element. + */ + u8 lock; + + /* For software way selection, as on Freescale TLB1 */ + u8 esel_next, esel_max, esel_first; +}; + +#ifdef CONFIG_PPC64 +extern unsigned long linear_map_top; +extern int book3e_htw_mode; + +#define PPC_HTW_NONE 0 +#define PPC_HTW_IBM 1 +#define PPC_HTW_E6500 2 + +/* + * 64-bit booke platforms don't load the tlb in the tlb miss handler code. + * HUGETLB_NEED_PRELOAD handles this - it causes huge_ptep_set_access_flags to + * return 1, indicating that the tlb requires preloading. + */ +#define HUGETLB_NEED_PRELOAD + +#define mmu_cleanup_all NULL + +#define MAX_PHYSMEM_BITS 44 + +#endif + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */ diff --git a/arch/powerpc/include/asm/nohash/mmu.h b/arch/powerpc/include/asm/nohash/mmu.h index edc793e5f08f..e264be219fdb 100644 --- a/arch/powerpc/include/asm/nohash/mmu.h +++ b/arch/powerpc/include/asm/nohash/mmu.h @@ -8,9 +8,9 @@ #elif defined(CONFIG_44x) /* 44x-style software loaded TLB */ #include -#elif defined(CONFIG_PPC_BOOK3E_MMU) +#elif defined(CONFIG_PPC_E500) /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ -#include +#include #elif defined (CONFIG_PPC_8xx) /* Motorola/Freescale 8xx software loaded TLB */ #include diff --git a/arch/powerpc/kernel/cpu_setup_e500.S b/arch/powerpc/kernel/cpu_setup_e500.S index 058336079069..2ab25161b0ad 100644 --- a/arch/powerpc/kernel/cpu_setup_e500.S +++ b/arch/powerpc/kernel/cpu_setup_e500.S @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index e6d5fe3a8585..2b5b0677d36c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -488,7 +488,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return) mtspr SPRN_##exc_lvl_srr0,r9; \ mtspr SPRN_##exc_lvl_srr1,r10; -#if defined(CONFIG_PPC_BOOK3E_MMU) +#if defined(CONFIG_PPC_E500) #ifdef CONFIG_PHYS_64BIT #define RESTORE_MAS7 \ lwz r11,MAS7(r1); \ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 1047dc053b47..1cb9d0f7cbf2 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -242,7 +242,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) .macro SAVE_MMU_REGS -#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_PPC_E500 mfspr r0,SPRN_MAS0 stw r0,MAS0(r1) mfspr r0,SPRN_MAS1 @@ -257,7 +257,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) mfspr r0,SPRN_MAS7 stw r0,MAS7(r1) #endif /* CONFIG_PHYS_64BIT */ -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* CONFIG_PPC_E500 */ #ifdef CONFIG_44x mfspr r0,SPRN_MMUCR stw r0,MMUCR(r1) diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 6568823cf306..5b3c093611ba 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -455,7 +455,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt); break; -#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_PPC_E500 case KVM_INST_MFSPR(SPRN_MAS0): if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt); @@ -484,7 +484,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt); break; -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* CONFIG_PPC_E500 */ case KVM_INST_MFSPR(SPRN_SPRG4): #ifdef CONFIG_BOOKE @@ -557,7 +557,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) case KVM_INST_MTSPR(SPRN_DSISR): kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt); break; -#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_PPC_E500 case KVM_INST_MTSPR(SPRN_MAS0): if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt); @@ -586,7 +586,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt); break; -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* CONFIG_PPC_E500 */ case KVM_INST_MTSPR(SPRN_SPRG4): if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index c3ef751465fb..6d0d329cbb35 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -17,7 +17,7 @@ #define KVM_E500_H #include -#include +#include #include #include diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index fcb1e5ae5c55..fac59fbd475a 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -49,7 +49,7 @@ * other sizes not listed here. The .ind field is only used on MMUs that have * indirect page table entries. */ -#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx) +#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_8xx) #ifdef CONFIG_PPC_E500 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { @@ -142,7 +142,7 @@ static inline int mmu_get_tsize(int psize) /* This isn't used on !Book3E for now */ return 0; } -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* CONFIG_PPC_E500 */ /* The variables below are currently only used on 64-bit Book3E * though this will probably be made common with other nohash diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index b533caaf0910..dc896d2874f3 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -4,7 +4,7 @@ obj-y += ptdump.o obj-$(CONFIG_4xx) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o +obj-$(CONFIG_PPC_E500) += shared.o obj-$(CONFIG_PPC_BOOK3S_32) += shared.o obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 32c60ad8f45d..1746d19d058f 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -466,10 +466,6 @@ config PPC_MMU_NOHASH def_bool y depends on !PPC_BOOK3S -config PPC_BOOK3E_MMU - def_bool y - depends on PPC_85xx || PPC_BOOK3E_64 - config PPC_HAVE_PMU_SUPPORT bool -- cgit v1.2.3 From 6556fd1a1e9fcd180348c4368d2387bdc6a17613 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Sep 2022 19:01:42 +0200 Subject: powerpc: Cleanup idle for e500 e500 idle setup is a bit messy. e500_idle() is used for PPC32 while book3e_idle() is used for PPC64. As they are mutually exclusive, call them all e500_idle(). Use CONFIG_MPC_85xx instead of PPC32 + E500 in Makefile and rename idle_e500.c to idle_85xx.c . Rename idle_book3e.c to idle_64e.c and remove #ifdef PPC64 in as it's only built on PPC64. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/8039301334e948974c85ec5ef2db37751075185b.1663606876.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/machdep.h | 1 - arch/powerpc/kernel/Makefile | 6 +- arch/powerpc/kernel/idle_64e.S | 99 +++++++++++++++++++++++++ arch/powerpc/kernel/idle_85xx.S | 85 +++++++++++++++++++++ arch/powerpc/kernel/idle_book3e.S | 103 -------------------------- arch/powerpc/kernel/idle_e500.S | 85 --------------------- arch/powerpc/platforms/85xx/corenet_generic.c | 4 - arch/powerpc/platforms/85xx/qemu_e500.c | 4 - 8 files changed, 186 insertions(+), 201 deletions(-) create mode 100644 arch/powerpc/kernel/idle_64e.S create mode 100644 arch/powerpc/kernel/idle_85xx.S delete mode 100644 arch/powerpc/kernel/idle_book3e.S delete mode 100644 arch/powerpc/kernel/idle_e500.S (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 8cb83600c434..378b8d5836a7 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -204,7 +204,6 @@ struct machdep_calls { extern void e500_idle(void); extern void power4_idle(void); extern void ppc6xx_idle(void); -extern void book3e_idle(void); /* * ppc_md contains a copy of the machine description structure for the diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 658c4dffaa56..1f121c188805 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -81,7 +81,7 @@ obj-$(CONFIG_PPC_DAWR) += dawr.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o -obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o +obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_64e.o obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o obj-$(CONFIG_PPC64) += vdso64_wrapper.o obj-$(CONFIG_ALTIVEC) += vecemu.o @@ -100,9 +100,7 @@ obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_FA_DUMP) += fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += fadump.o -ifdef CONFIG_PPC32 -obj-$(CONFIG_PPC_E500) += idle_e500.o -endif +obj-$(CONFIG_PPC_85xx) += idle_85xx.o obj-$(CONFIG_PPC_BOOK3S_32) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o diff --git a/arch/powerpc/kernel/idle_64e.S b/arch/powerpc/kernel/idle_64e.S new file mode 100644 index 000000000000..1736aad2afe9 --- /dev/null +++ b/arch/powerpc/kernel/idle_64e.S @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2010 IBM Corp, Benjamin Herrenschmidt + * + * Generic idle routine for 64 bits e500 processors + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* 64-bit version only for now */ +.macro BOOK3E_IDLE name loop +_GLOBAL(\name) + /* Save LR for later */ + mflr r0 + std r0,16(r1) + + /* Hard disable interrupts */ + wrteei 0 + + /* Now check if an interrupt came in while we were soft disabled + * since we may otherwise lose it (doorbells etc...). + */ + lbz r3,PACAIRQHAPPENED(r13) + cmpwi cr0,r3,0 + bne 2f + + /* Now we are going to mark ourselves as soft and hard enabled in + * order to be able to take interrupts while asleep. We inform lockdep + * of that. We don't actually turn interrupts on just yet tho. + */ +#ifdef CONFIG_TRACE_IRQFLAGS + stdu r1,-128(r1) + bl trace_hardirqs_on + addi r1,r1,128 +#endif + li r0,IRQS_ENABLED + stb r0,PACAIRQSOFTMASK(r13) + + /* Interrupts will make use return to LR, so get something we want + * in there + */ + bl 1f + + /* And return (interrupts are on) */ + ld r0,16(r1) + mtlr r0 + blr + +1: /* Let's set the _TLF_NAPPING flag so interrupts make us return + * to the right spot + */ + ld r11, PACACURRENT(r13) + ld r10,TI_LOCAL_FLAGS(r11) + ori r10,r10,_TLF_NAPPING + std r10,TI_LOCAL_FLAGS(r11) + + /* We can now re-enable hard interrupts and go to sleep */ + wrteei 1 + \loop + +2: + lbz r10,PACAIRQHAPPENED(r13) + ori r10,r10,PACA_IRQ_HARD_DIS + stb r10,PACAIRQHAPPENED(r13) + blr +.endm + +.macro BOOK3E_IDLE_LOOP +1: + PPC_WAIT(0) + b 1b +.endm + +/* epapr_ev_idle_start below is patched with the proper hcall + opcodes during kernel initialization */ +.macro EPAPR_EV_IDLE_LOOP +idle_loop: + LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) + +.global epapr_ev_idle_start +epapr_ev_idle_start: + li r3, -1 + nop + nop + nop + b idle_loop +.endm + +BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP + +BOOK3E_IDLE e500_idle BOOK3E_IDLE_LOOP diff --git a/arch/powerpc/kernel/idle_85xx.S b/arch/powerpc/kernel/idle_85xx.S new file mode 100644 index 000000000000..9e1bc4502c50 --- /dev/null +++ b/arch/powerpc/kernel/idle_85xx.S @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Dave Liu + * copy from idle_6xx.S and modify for e500 based processor, + * implement the power_save function in idle. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + .text + +_GLOBAL(e500_idle) + lwz r4,TI_LOCAL_FLAGS(r2) /* set napping bit */ + ori r4,r4,_TLF_NAPPING /* so when we take an exception */ + stw r4,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ + +#ifdef CONFIG_PPC_E500MC + wrteei 1 +1: wait + + /* + * Guard against spurious wakeups (e.g. from a hypervisor) -- + * any real interrupt will cause us to return to LR due to + * _TLF_NAPPING. + */ + b 1b +#else + /* Check if we can nap or doze, put HID0 mask in r3 */ + lis r3,0 +BEGIN_FTR_SECTION + lis r3,HID0_DOZE@h +END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) + +BEGIN_FTR_SECTION + /* Now check if user enabled NAP mode */ + lis r4,powersave_nap@ha + lwz r4,powersave_nap@l(r4) + cmpwi 0,r4,0 + beq 1f + stwu r1,-16(r1) + mflr r0 + stw r0,20(r1) + bl flush_dcache_L1 + lwz r0,20(r1) + addi r1,r1,16 + mtlr r0 + lis r3,HID0_NAP@h +END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) +1: + /* Go to NAP or DOZE now */ + mfspr r4,SPRN_HID0 + rlwinm r4,r4,0,~(HID0_DOZE|HID0_NAP|HID0_SLEEP) + or r4,r4,r3 + isync + mtspr SPRN_HID0,r4 + isync + + mfmsr r7 + oris r7,r7,MSR_WE@h + ori r7,r7,MSR_EE + msync + mtmsr r7 + isync +2: b 2b +#endif /* !E500MC */ + +/* + * Return from NAP/DOZE mode, restore some CPU specific registers, + * r2 containing address of current. + * r11 points to the exception frame. + * We have to preserve r10. + */ +_GLOBAL(power_save_ppc32_restore) + lwz r9,_LINK(r11) /* interrupted in e500_idle */ + stw r9,_NIP(r11) /* make it do a blr */ + blr +_ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore) diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S deleted file mode 100644 index cc008de58b05..000000000000 --- a/arch/powerpc/kernel/idle_book3e.S +++ /dev/null @@ -1,103 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2010 IBM Corp, Benjamin Herrenschmidt - * - * Generic idle routine for Book3E processors - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* 64-bit version only for now */ -#ifdef CONFIG_PPC64 - -.macro BOOK3E_IDLE name loop -_GLOBAL(\name) - /* Save LR for later */ - mflr r0 - std r0,16(r1) - - /* Hard disable interrupts */ - wrteei 0 - - /* Now check if an interrupt came in while we were soft disabled - * since we may otherwise lose it (doorbells etc...). - */ - lbz r3,PACAIRQHAPPENED(r13) - cmpwi cr0,r3,0 - bne 2f - - /* Now we are going to mark ourselves as soft and hard enabled in - * order to be able to take interrupts while asleep. We inform lockdep - * of that. We don't actually turn interrupts on just yet tho. - */ -#ifdef CONFIG_TRACE_IRQFLAGS - stdu r1,-128(r1) - bl trace_hardirqs_on - addi r1,r1,128 -#endif - li r0,IRQS_ENABLED - stb r0,PACAIRQSOFTMASK(r13) - - /* Interrupts will make use return to LR, so get something we want - * in there - */ - bl 1f - - /* And return (interrupts are on) */ - ld r0,16(r1) - mtlr r0 - blr - -1: /* Let's set the _TLF_NAPPING flag so interrupts make us return - * to the right spot - */ - ld r11, PACACURRENT(r13) - ld r10,TI_LOCAL_FLAGS(r11) - ori r10,r10,_TLF_NAPPING - std r10,TI_LOCAL_FLAGS(r11) - - /* We can now re-enable hard interrupts and go to sleep */ - wrteei 1 - \loop - -2: - lbz r10,PACAIRQHAPPENED(r13) - ori r10,r10,PACA_IRQ_HARD_DIS - stb r10,PACAIRQHAPPENED(r13) - blr -.endm - -.macro BOOK3E_IDLE_LOOP -1: - PPC_WAIT(0) - b 1b -.endm - -/* epapr_ev_idle_start below is patched with the proper hcall - opcodes during kernel initialization */ -.macro EPAPR_EV_IDLE_LOOP -idle_loop: - LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) - -.global epapr_ev_idle_start -epapr_ev_idle_start: - li r3, -1 - nop - nop - nop - b idle_loop -.endm - -BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP - -BOOK3E_IDLE book3e_idle BOOK3E_IDLE_LOOP - -#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S deleted file mode 100644 index 9e1bc4502c50..000000000000 --- a/arch/powerpc/kernel/idle_e500.S +++ /dev/null @@ -1,85 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. - * Dave Liu - * copy from idle_6xx.S and modify for e500 based processor, - * implement the power_save function in idle. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - - .text - -_GLOBAL(e500_idle) - lwz r4,TI_LOCAL_FLAGS(r2) /* set napping bit */ - ori r4,r4,_TLF_NAPPING /* so when we take an exception */ - stw r4,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ - -#ifdef CONFIG_PPC_E500MC - wrteei 1 -1: wait - - /* - * Guard against spurious wakeups (e.g. from a hypervisor) -- - * any real interrupt will cause us to return to LR due to - * _TLF_NAPPING. - */ - b 1b -#else - /* Check if we can nap or doze, put HID0 mask in r3 */ - lis r3,0 -BEGIN_FTR_SECTION - lis r3,HID0_DOZE@h -END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) - -BEGIN_FTR_SECTION - /* Now check if user enabled NAP mode */ - lis r4,powersave_nap@ha - lwz r4,powersave_nap@l(r4) - cmpwi 0,r4,0 - beq 1f - stwu r1,-16(r1) - mflr r0 - stw r0,20(r1) - bl flush_dcache_L1 - lwz r0,20(r1) - addi r1,r1,16 - mtlr r0 - lis r3,HID0_NAP@h -END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) -1: - /* Go to NAP or DOZE now */ - mfspr r4,SPRN_HID0 - rlwinm r4,r4,0,~(HID0_DOZE|HID0_NAP|HID0_SLEEP) - or r4,r4,r3 - isync - mtspr SPRN_HID0,r4 - isync - - mfmsr r7 - oris r7,r7,MSR_WE@h - ori r7,r7,MSR_EE - msync - mtmsr r7 - isync -2: b 2b -#endif /* !E500MC */ - -/* - * Return from NAP/DOZE mode, restore some CPU specific registers, - * r2 containing address of current. - * r11 points to the exception frame. - * We have to preserve r10. - */ -_GLOBAL(power_save_ppc32_restore) - lwz r9,_LINK(r11) /* interrupted in e500_idle */ - stw r9,_NIP(r11) /* make it do a blr */ - blr -_ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore) diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c index 28d6b36f1ccd..2c539de2d629 100644 --- a/arch/powerpc/platforms/85xx/corenet_generic.c +++ b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -200,9 +200,5 @@ define_machine(corenet_generic) { #endif .calibrate_decr = generic_calibrate_decr, .progress = udbg_progress, -#ifdef CONFIG_PPC64 - .power_save = book3e_idle, -#else .power_save = e500_idle, -#endif }; diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c b/arch/powerpc/platforms/85xx/qemu_e500.c index 64109ad6736c..1639e222cc33 100644 --- a/arch/powerpc/platforms/85xx/qemu_e500.c +++ b/arch/powerpc/platforms/85xx/qemu_e500.c @@ -68,9 +68,5 @@ define_machine(qemu_e500) { .get_irq = mpic_get_coreint_irq, .calibrate_decr = generic_calibrate_decr, .progress = udbg_progress, -#ifdef CONFIG_PPC64 - .power_save = book3e_idle, -#else .power_save = e500_idle, -#endif }; -- cgit v1.2.3 From 4af83545538a4fa80d14b9247ffc0db556e6a556 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Sep 2022 08:41:08 +0200 Subject: powerpc/irq: Refactor irq_soft_mask_{set,or}_return() This partialy reapply commit ef5b570d3700 ("powerpc/irq: Don't open code irq_soft_mask helpers") which was reverted by commit 684c68d92e2e ("Revert "powerpc/irq: Don't open code irq_soft_mask helpers"") irq_soft_mask_set_return() and irq_soft_mask_or_return() are overset of irq_soft_mask_set(). Have them use irq_soft_mask_set() instead of duplicating it. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/18473da42362ee8f07bce36b9caef8ba77d7633f.1663656054.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 983551859891..e8de249339d8 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -157,36 +157,18 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) { - unsigned long flags; - -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON(mask && !(mask & IRQS_DISABLED)); -#endif + unsigned long flags = irq_soft_mask_return(); - asm volatile( - "lbz %0,%1(13); stb %2,%1(13)" - : "=&r" (flags) - : "i" (offsetof(struct paca_struct, irq_soft_mask)), - "r" (mask) - : "memory"); + irq_soft_mask_set(mask); return flags; } static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) { - unsigned long flags, tmp; + unsigned long flags = irq_soft_mask_return(); - asm volatile( - "lbz %0,%2(13); or %1,%0,%3; stb %1,%2(13)" - : "=&r" (flags), "=r" (tmp) - : "i" (offsetof(struct paca_struct, irq_soft_mask)), - "r" (mask) - : "memory"); - -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON((mask | flags) && !((mask | flags) & IRQS_DISABLED)); -#endif + irq_soft_mask_set(flags | mask); return flags; } -- cgit v1.2.3 From 5ba6c9a912fe4c60f84d6617ad10d2b8d7910990 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:41 +1000 Subject: powerpc: Remove asmlinkage from syscall handler definitions The asmlinkage macro has no special meaning in powerpc, and prior to this patch is used sporadically on some syscall handler definitions. On architectures that do not define asmlinkage, it resolves to extern "C" for C++ compilers and a nop otherwise. The current invocations of asmlinkage provide far from complete support for C++ toolchains, and so the macro serves no purpose in powerpc. Remove all invocations of asmlinkage in arch/powerpc. These incidentally only occur in syscall definitions and prototypes. Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-2-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscalls.h | 16 ++++++++-------- arch/powerpc/kernel/sys_ppc32.c | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index a2b13e55254f..21c2faaa2957 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -10,14 +10,14 @@ struct rtas_args; -asmlinkage long sys_mmap(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset); -asmlinkage long sys_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage long ppc64_personality(unsigned long personality); -asmlinkage long sys_rtas(struct rtas_args __user *uargs); +long sys_mmap(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, off_t offset); +long sys_mmap2(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); +long ppc64_personality(unsigned long personality); +long sys_rtas(struct rtas_args __user *uargs); int ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp); long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 16ff0399a257..f4edcc9489fb 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -85,20 +85,20 @@ compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u3 return ksys_readahead(fd, merge_64(offset1, offset2), count); } -asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4, +int compat_sys_truncate64(const char __user * path, u32 reg4, unsigned long len1, unsigned long len2) { return ksys_truncate(path, merge_64(len1, len2)); } -asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, +long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, u32 len1, u32 len2) { return ksys_fallocate(fd, mode, ((loff_t)offset1 << 32) | offset2, merge_64(len1, len2)); } -asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, +int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, unsigned long len2) { return ksys_ftruncate(fd, merge_64(len1, len2)); @@ -111,7 +111,7 @@ long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, advice); } -asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags, +long compat_sys_sync_file_range2(int fd, unsigned int flags, unsigned offset1, unsigned offset2, unsigned nbytes1, unsigned nbytes2) { -- cgit v1.2.3 From 9d54a5ce3aa87810f13cd33b314097ac6d28c350 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:43 +1000 Subject: powerpc: Add ZEROIZE_GPRS macros for register clears Provide register zeroing macros, following the same convention as existing register stack save/restore macros, to be used in later change to concisely zero a sequence of consecutive gprs. The resulting macros are called ZEROIZE_GPRS and ZEROIZE_NVGPRS, keeping with the naming of the accompanying restore and save macros, and usage of zeroize to describe this operation elsewhere in the kernel. Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-4-rmclure@linux.ibm.com --- arch/powerpc/include/asm/ppc_asm.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 7e4fe766e247..eeb7dc8cd45f 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -33,6 +33,20 @@ .endr .endm +/* + * This expands to a sequence of register clears for regs start to end + * inclusive, of the form: + * + * li rN, 0 + */ +.macro ZEROIZE_REGS start, end + .Lreg=\start + .rept (\end - \start + 1) + li .Lreg, 0 + .Lreg=.Lreg+1 + .endr +.endm + /* * Macros for storing registers into and loading registers from * exception frames. @@ -49,6 +63,14 @@ #define REST_NVGPRS(base) REST_GPRS(13, 31, base) #endif +#define ZEROIZE_GPRS(start, end) ZEROIZE_REGS start, end +#ifdef __powerpc64__ +#define ZEROIZE_NVGPRS() ZEROIZE_GPRS(14, 31) +#else +#define ZEROIZE_NVGPRS() ZEROIZE_GPRS(13, 31) +#endif +#define ZEROIZE_GPR(n) ZEROIZE_GPRS(n, n) + #define SAVE_GPR(n, base) SAVE_GPRS(n, n, base) #define REST_GPR(n, base) REST_GPRS(n, n, base) -- cgit v1.2.3 From 016ff72bd2090903715c0f9422a44afbb966f4ee Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:48 +1000 Subject: powerpc: Fix fallocate and fadvise64_64 compat parameter combination As reported[1] by Arnd, the arch-specific fadvise64_64 and fallocate compatibility handlers assume parameters are passed with 32-bit big-endian ABI. This affects the assignment of odd-even parameter pairs to the high or low words of a 64-bit syscall parameter. Fix fadvise64_64 fallocate compat handlers to correctly swap upper/lower 32 bits conditioned on endianness. A future patch will replace the arch-specific compat fallocate with an asm-generic implementation. This patch is intended for ease of back-port. [1]: https://lore.kernel.org/all/be29926f-226e-48dc-871a-e29a54e80583@www.fastmail.com/ Fixes: 57f48b4b74e7 ("powerpc/compat_sys: swap hi/lo parts of 64-bit syscall args in LE mode") Reported-by: Arnd Bergmann Signed-off-by: Rohan McLure Reviewed-by: Arnd Bergmann Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-9-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscalls.h | 12 ++++++++++++ arch/powerpc/kernel/sys_ppc32.c | 14 +------------- arch/powerpc/kernel/syscalls.c | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 21c2faaa2957..ae7ca59f6267 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -8,6 +8,18 @@ #include #include +/* + * long long munging: + * The 32 bit ABI passes long longs in an odd even register pair. + * High and low parts are swapped depending on endian mode, + * so define a macro (similar to mips linux32) to handle that. + */ +#ifdef __LITTLE_ENDIAN__ +#define merge_64(low, high) (((u64)high << 32) | low) +#else +#define merge_64(high, low) (((u64)high << 32) | low) +#endif + struct rtas_args; long sys_mmap(unsigned long addr, size_t len, diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index f4edcc9489fb..ba363328da2b 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -56,18 +56,6 @@ unsigned long compat_sys_mmap2(unsigned long addr, size_t len, return sys_mmap(addr, len, prot, flags, fd, pgoff << 12); } -/* - * long long munging: - * The 32 bit ABI passes long longs in an odd even register pair. - * High and low parts are swapped depending on endian mode, - * so define a macro (similar to mips linux32) to handle that. - */ -#ifdef __LITTLE_ENDIAN__ -#define merge_64(low, high) ((u64)high << 32) | low -#else -#define merge_64(high, low) ((u64)high << 32) | low -#endif - compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 reg6, u32 pos1, u32 pos2) { @@ -94,7 +82,7 @@ int compat_sys_truncate64(const char __user * path, u32 reg4, long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, u32 len1, u32 len2) { - return ksys_fallocate(fd, mode, ((loff_t)offset1 << 32) | offset2, + return ksys_fallocate(fd, mode, merge_64(offset1, offset2), merge_64(len1, len2)); } diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index fc999140bc27..abc3fbb3c490 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -98,8 +98,8 @@ long ppc64_personality(unsigned long personality) long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low) { - return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, - (u64)len_high << 32 | len_low, advice); + return ksys_fadvise64_64(fd, merge_64(offset_high, offset_low), + merge_64(len_high, len_low), advice); } SYSCALL_DEFINE0(switch_endian) -- cgit v1.2.3 From c2e7a19827eec443a7cbe85e8d959052412d6dc3 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:50 +1000 Subject: powerpc: Use generic fallocate compatibility syscall The powerpc fallocate compat syscall handler is identical to the generic implementation provided by commit 59c10c52f573f ("riscv: compat: syscall: Add compat_sys_call_table implementation"), and as such can be removed in favour of the generic implementation. A future patch series will replace more architecture-defined syscall handlers with generic implementations, dependent on introducing generic implementations that are compatible with powerpc and arm's parameter reorderings. Reported-by: Arnd Bergmann Signed-off-by: Rohan McLure Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-11-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscalls.h | 2 -- arch/powerpc/include/asm/unistd.h | 1 + arch/powerpc/kernel/sys_ppc32.c | 7 ------- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index ae7ca59f6267..52f5e1985989 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -51,8 +51,6 @@ compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u3 int compat_sys_truncate64(const char __user *path, u32 reg4, unsigned long len1, unsigned long len2); -long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, u32 len1, u32 len2); - int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, unsigned long len2); diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b1129b4ef57d..659a996c75aa 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -45,6 +45,7 @@ #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_NEWFSTATAT #define __ARCH_WANT_COMPAT_STAT +#define __ARCH_WANT_COMPAT_FALLOCATE #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif #define __ARCH_WANT_SYS_FORK diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index ba363328da2b..d961634976d8 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -79,13 +79,6 @@ int compat_sys_truncate64(const char __user * path, u32 reg4, return ksys_truncate(path, merge_64(len1, len2)); } -long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, - u32 len1, u32 len2) -{ - return ksys_fallocate(fd, mode, merge_64(offset1, offset2), - merge_64(len1, len2)); -} - int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, unsigned long len2) { -- cgit v1.2.3 From b6b1334c9510e162bd8ca0ae58403cafad9572f1 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:51 +1000 Subject: powerpc/32: Remove powerpc select specialisation Syscall #82 has been implemented for 32-bit platforms in a unique way on powerpc systems. This hack will in effect guess whether the caller is expecting new select semantics or old select semantics. It does so via a guess, based off the first parameter. In new select, this parameter represents the length of a user-memory array of file descriptors, and in old select this is a pointer to an arguments structure. The heuristic simply interprets sufficiently large values of its first parameter as being a call to old select. The following is a discussion on how this syscall should be handled. As discussed in this thread, the existence of such a hack suggests that for whatever powerpc binaries may predate glibc, it is most likely that they would have taken use of the old select semantics. x86 and arm64 both implement this syscall with oldselect semantics. Remove the powerpc implementation, and update syscall.tbl to refer to emit a reference to sys_old_select and compat_sys_old_select for 32-bit binaries, in keeping with how other architectures support syscall #82. Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/lkml/13737de5-0eb7-e881-9af0-163b0d29a1a0@csgroup.eu/ Link: https://lore.kernel.org/r/20220921065605.1051927-12-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscalls.h | 2 -- arch/powerpc/kernel/syscalls.c | 17 ----------------- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 2 +- 4 files changed, 2 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 52f5e1985989..565b4b1d7d41 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -30,8 +30,6 @@ long sys_mmap2(unsigned long addr, size_t len, unsigned long fd, unsigned long pgoff); long ppc64_personality(unsigned long personality); long sys_rtas(struct rtas_args __user *uargs); -int ppc_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct __kernel_old_timeval __user *tvp); long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low); diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index abc3fbb3c490..34e1ae88e15b 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -63,23 +63,6 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len, return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT); } -#ifdef CONFIG_PPC32 -/* - * Due to some executables calling the wrong select we sometimes - * get wrong args. This determines how the args are being passed - * (a single ptr to them all args passed) then calls - * sys_select() with the appropriate args. -- Cort - */ -int -ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) -{ - if ((unsigned long)n >= 4096) - return sys_old_select((void __user *)n); - - return sys_select(n, inp, outp, exp, tvp); -} -#endif - #ifdef CONFIG_PPC64 long ppc64_personality(unsigned long personality) { diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 2600b4237292..64f27cbbdd2c 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -110,7 +110,7 @@ 79 common settimeofday sys_settimeofday compat_sys_settimeofday 80 common getgroups sys_getgroups 81 common setgroups sys_setgroups -82 32 select ppc_select sys_ni_syscall +82 32 select sys_old_select compat_sys_old_select 82 64 select sys_ni_syscall 82 spu select sys_ni_syscall 83 common symlink sys_symlink diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 2600b4237292..64f27cbbdd2c 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -110,7 +110,7 @@ 79 common settimeofday sys_settimeofday compat_sys_settimeofday 80 common getgroups sys_getgroups 81 common setgroups sys_setgroups -82 32 select ppc_select sys_ni_syscall +82 32 select sys_old_select compat_sys_old_select 82 64 select sys_ni_syscall 82 spu select sys_ni_syscall 83 common symlink sys_symlink -- cgit v1.2.3 From b7fa9ce86d32baf2a3a8bf8fdaa44870084edd85 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:53 +1000 Subject: powerpc: Remove direct call to mmap2 syscall handlers Syscall handlers should not be invoked internally by their symbol names, as these symbols defined by the architecture-defined SYSCALL_DEFINE macro. Move the compatibility syscall definition for mmap2 to syscalls.c, so that all mmap implementations can share a helper function. Remove 'inline' on static mmap helper. Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin [mpe: Fix compat_sys_mmap2() prototype and offset handling] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-14-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscalls.h | 2 +- arch/powerpc/kernel/sys_ppc32.c | 9 --------- arch/powerpc/kernel/syscalls.c | 16 +++++++++++++--- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 565b4b1d7d41..6610dc8d078a 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -34,7 +34,7 @@ long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low); #ifdef CONFIG_COMPAT -unsigned long compat_sys_mmap2(unsigned long addr, size_t len, +long compat_sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index d961634976d8..776ae7565fc5 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -48,14 +47,6 @@ #include #include -unsigned long compat_sys_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - /* This should remain 12 even if PAGE_SIZE changes */ - return sys_mmap(addr, len, prot, flags, fd, pgoff << 12); -} - compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 reg6, u32 pos1, u32 pos2) { diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index a04c97faa21a..e84a523cd65e 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -36,9 +36,9 @@ #include #include -static inline long do_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off, int shift) +static long do_mmap2(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long off, int shift) { if (!arch_validate_prot(prot, addr)) return -EINVAL; @@ -56,6 +56,16 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len, return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(mmap2, + unsigned long, addr, size_t, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off_4k) +{ + return do_mmap2(addr, len, prot, flags, fd, off_4k, PAGE_SHIFT-12); +} +#endif + SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, off_t, offset) -- cgit v1.2.3 From dec20c50df79cadaff17e964ef7f622491a52134 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:55 +1000 Subject: powerpc: Adopt SYSCALL_DEFINE for arch-specific syscall handlers Arch-specific implementations of syscall handlers are currently used over generic implementations for the following reasons: 1. Semantics unique to powerpc 2. Compatibility syscalls require 'argument padding' to comply with 64-bit argument convention in ELF32 abi. 3. Parameter types or order is different in other architectures. These syscall handlers have been defined prior to this patch series without invoking the SYSCALL_DEFINE or COMPAT_SYSCALL_DEFINE macros with custom input and output types. We remove every such direct definition in favour of the aforementioned macros. Also update syscalls.tbl in order to refer to the symbol names generated by each of these macros. Since ppc64_personality can be called by both 64 bit and 32 bit binaries through compatibility, we must generate both both compat_sys_ and sys_ symbols for this handler. As an aside: A number of architectures including arm and powerpc agree on an alternative argument order and numbering for most of these arch-specific handlers. A future patch series may allow for asm/unistd.h to signal through its defines that a generic implementation of these syscall handlers with the correct calling convention be emitted, through the __ARCH_WANT_COMPAT_SYS_... convention. Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-16-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscalls.h | 10 +++--- arch/powerpc/kernel/sys_ppc32.c | 38 ++++++++++++++-------- arch/powerpc/kernel/syscalls.c | 17 +++++++--- arch/powerpc/kernel/syscalls/syscall.tbl | 22 ++++++------- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 22 ++++++------- 5 files changed, 64 insertions(+), 45 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 6610dc8d078a..8e1787ff6c93 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -28,10 +28,10 @@ long sys_mmap(unsigned long addr, size_t len, long sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); -long ppc64_personality(unsigned long personality); +long sys_ppc64_personality(unsigned long personality); long sys_rtas(struct rtas_args __user *uargs); -long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, - u32 len_high, u32 len_low); +long sys_ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, + u32 len_high, u32 len_low); #ifdef CONFIG_COMPAT long compat_sys_mmap2(unsigned long addr, size_t len, @@ -52,8 +52,8 @@ int compat_sys_truncate64(const char __user *path, u32 reg4, int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, unsigned long len2); -long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, - size_t len, int advice); +long compat_sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, + size_t len, int advice); long compat_sys_sync_file_range2(int fd, unsigned int flags, unsigned int offset1, unsigned int offset2, diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 776ae7565fc5..dcc3c9fd4cfd 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -47,45 +47,55 @@ #include #include -compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, - u32 reg6, u32 pos1, u32 pos2) +COMPAT_SYSCALL_DEFINE6(ppc_pread64, + unsigned int, fd, + char __user *, ubuf, compat_size_t, count, + u32, reg6, u32, pos1, u32, pos2) { return ksys_pread64(fd, ubuf, count, merge_64(pos1, pos2)); } -compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, - u32 reg6, u32 pos1, u32 pos2) +COMPAT_SYSCALL_DEFINE6(ppc_pwrite64, + unsigned int, fd, + const char __user *, ubuf, compat_size_t, count, + u32, reg6, u32, pos1, u32, pos2) { return ksys_pwrite64(fd, ubuf, count, merge_64(pos1, pos2)); } -compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count) +COMPAT_SYSCALL_DEFINE5(ppc_readahead, + int, fd, u32, r4, + u32, offset1, u32, offset2, u32, count) { return ksys_readahead(fd, merge_64(offset1, offset2), count); } -int compat_sys_truncate64(const char __user * path, u32 reg4, - unsigned long len1, unsigned long len2) +COMPAT_SYSCALL_DEFINE4(ppc_truncate64, + const char __user *, path, u32, reg4, + unsigned long, len1, unsigned long, len2) { return ksys_truncate(path, merge_64(len1, len2)); } -int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, - unsigned long len2) +COMPAT_SYSCALL_DEFINE4(ppc_ftruncate64, + unsigned int, fd, u32, reg4, + unsigned long, len1, unsigned long, len2) { return ksys_ftruncate(fd, merge_64(len1, len2)); } -long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, - size_t len, int advice) +COMPAT_SYSCALL_DEFINE6(ppc32_fadvise64, + int, fd, u32, unused, u32, offset1, u32, offset2, + size_t, len, int, advice) { return ksys_fadvise64_64(fd, merge_64(offset1, offset2), len, advice); } -long compat_sys_sync_file_range2(int fd, unsigned int flags, - unsigned offset1, unsigned offset2, - unsigned nbytes1, unsigned nbytes2) +COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2, + int, fd, unsigned int, flags, + unsigned int, offset1, unsigned int, offset2, + unsigned int, nbytes1, unsigned int, nbytes2) { loff_t offset = merge_64(offset1, offset2); loff_t nbytes = merge_64(nbytes1, nbytes2); diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 8d4b7f6d7cf3..68ebb23a5af4 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -86,14 +86,23 @@ static long do_ppc64_personality(unsigned long personality) ret = (ret & ~PER_MASK) | PER_LINUX; return ret; } -long ppc64_personality(unsigned long personality) + +SYSCALL_DEFINE1(ppc64_personality, unsigned long, personality) { return do_ppc64_personality(personality); } -#endif -long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, - u32 len_high, u32 len_low) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE1(ppc64_personality, unsigned long, personality) +{ + return do_ppc64_personality(personality); +} +#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_PPC64 */ + +SYSCALL_DEFINE6(ppc_fadvise64_64, + int, fd, int, advice, u32, offset_high, u32, offset_low, + u32, len_high, u32, len_low) { return ksys_fadvise64_64(fd, merge_64(offset_high, offset_low), merge_64(len_high, len_low), advice); diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 64f27cbbdd2c..2bca64f96164 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -178,9 +178,9 @@ 133 common fchdir sys_fchdir 134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs -136 32 personality sys_personality ppc64_personality -136 64 personality ppc64_personality -136 spu personality ppc64_personality +136 32 personality sys_personality compat_sys_ppc64_personality +136 64 personality sys_ppc64_personality +136 spu personality sys_ppc64_personality 137 common afs_syscall sys_ni_syscall 138 common setfsuid sys_setfsuid 139 common setfsgid sys_setfsgid @@ -228,8 +228,8 @@ 176 64 rt_sigtimedwait sys_rt_sigtimedwait 177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -179 common pread64 sys_pread64 compat_sys_pread64 -180 common pwrite64 sys_pwrite64 compat_sys_pwrite64 +179 common pread64 sys_pread64 compat_sys_ppc_pread64 +180 common pwrite64 sys_pwrite64 compat_sys_ppc_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -242,10 +242,10 @@ 188 common putpmsg sys_ni_syscall 189 nospu vfork sys_vfork 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit -191 common readahead sys_readahead compat_sys_readahead +191 common readahead sys_readahead compat_sys_ppc_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 -193 32 truncate64 sys_truncate64 compat_sys_truncate64 -194 32 ftruncate64 sys_ftruncate64 compat_sys_ftruncate64 +193 32 truncate64 sys_truncate64 compat_sys_ppc_truncate64 +194 32 ftruncate64 sys_ftruncate64 compat_sys_ppc_ftruncate64 195 32 stat64 sys_stat64 196 32 lstat64 sys_lstat64 197 32 fstat64 sys_fstat64 @@ -288,7 +288,7 @@ 230 common io_submit sys_io_submit compat_sys_io_submit 231 common io_cancel sys_io_cancel 232 nospu set_tid_address sys_set_tid_address -233 common fadvise64 sys_fadvise64 ppc32_fadvise64 +233 common fadvise64 sys_fadvise64 compat_sys_ppc32_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie 236 common epoll_create sys_epoll_create @@ -323,7 +323,7 @@ 251 spu utimes sys_utimes 252 common statfs64 sys_statfs64 compat_sys_statfs64 253 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -254 32 fadvise64_64 ppc_fadvise64_64 +254 32 fadvise64_64 sys_ppc_fadvise64_64 254 spu fadvise64_64 sys_ni_syscall 255 common rtas sys_rtas 256 32 sys_debug_setcontext sys_debug_setcontext sys_ni_syscall @@ -390,7 +390,7 @@ 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd -308 common sync_file_range2 sys_sync_file_range2 compat_sys_sync_file_range2 +308 common sync_file_range2 sys_sync_file_range2 compat_sys_ppc_sync_file_range2 309 nospu fallocate sys_fallocate compat_sys_fallocate 310 nospu subpage_prot sys_subpage_prot 311 32 timerfd_settime sys_timerfd_settime32 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 64f27cbbdd2c..2bca64f96164 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -178,9 +178,9 @@ 133 common fchdir sys_fchdir 134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs -136 32 personality sys_personality ppc64_personality -136 64 personality ppc64_personality -136 spu personality ppc64_personality +136 32 personality sys_personality compat_sys_ppc64_personality +136 64 personality sys_ppc64_personality +136 spu personality sys_ppc64_personality 137 common afs_syscall sys_ni_syscall 138 common setfsuid sys_setfsuid 139 common setfsgid sys_setfsgid @@ -228,8 +228,8 @@ 176 64 rt_sigtimedwait sys_rt_sigtimedwait 177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -179 common pread64 sys_pread64 compat_sys_pread64 -180 common pwrite64 sys_pwrite64 compat_sys_pwrite64 +179 common pread64 sys_pread64 compat_sys_ppc_pread64 +180 common pwrite64 sys_pwrite64 compat_sys_ppc_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -242,10 +242,10 @@ 188 common putpmsg sys_ni_syscall 189 nospu vfork sys_vfork 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit -191 common readahead sys_readahead compat_sys_readahead +191 common readahead sys_readahead compat_sys_ppc_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 -193 32 truncate64 sys_truncate64 compat_sys_truncate64 -194 32 ftruncate64 sys_ftruncate64 compat_sys_ftruncate64 +193 32 truncate64 sys_truncate64 compat_sys_ppc_truncate64 +194 32 ftruncate64 sys_ftruncate64 compat_sys_ppc_ftruncate64 195 32 stat64 sys_stat64 196 32 lstat64 sys_lstat64 197 32 fstat64 sys_fstat64 @@ -288,7 +288,7 @@ 230 common io_submit sys_io_submit compat_sys_io_submit 231 common io_cancel sys_io_cancel 232 nospu set_tid_address sys_set_tid_address -233 common fadvise64 sys_fadvise64 ppc32_fadvise64 +233 common fadvise64 sys_fadvise64 compat_sys_ppc32_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie 236 common epoll_create sys_epoll_create @@ -323,7 +323,7 @@ 251 spu utimes sys_utimes 252 common statfs64 sys_statfs64 compat_sys_statfs64 253 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -254 32 fadvise64_64 ppc_fadvise64_64 +254 32 fadvise64_64 sys_ppc_fadvise64_64 254 spu fadvise64_64 sys_ni_syscall 255 common rtas sys_rtas 256 32 sys_debug_setcontext sys_debug_setcontext sys_ni_syscall @@ -390,7 +390,7 @@ 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd -308 common sync_file_range2 sys_sync_file_range2 compat_sys_sync_file_range2 +308 common sync_file_range2 sys_sync_file_range2 compat_sys_ppc_sync_file_range2 309 nospu fallocate sys_fallocate compat_sys_fallocate 310 nospu subpage_prot sys_subpage_prot 311 32 timerfd_settime sys_timerfd_settime32 -- cgit v1.2.3 From 8cd1def4b8e4a592949509fac443e850da8428d0 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:56 +1000 Subject: powerpc: Include all arch-specific syscall prototypes Forward declare all syscall handler prototypes where a generic prototype is not provided in either linux/syscalls.h or linux/compat.h in asm/syscalls.h. This is required for compile-time type-checking for syscall handlers, which is implemented later in this series. 32-bit compatibility syscall handlers are expressed in terms of types in ppc32.h. Expose this header globally. Signed-off-by: Rohan McLure Acked-by: Nicholas Piggin [mpe: Use standard include guard naming for syscalls_32.h] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-17-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscalls.h | 97 ++++++++++++++++++++++++++-------- arch/powerpc/include/asm/syscalls_32.h | 60 +++++++++++++++++++++ arch/powerpc/kernel/ppc32.h | 60 --------------------- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/perf/callchain_32.c | 2 +- 5 files changed, 137 insertions(+), 84 deletions(-) create mode 100644 arch/powerpc/include/asm/syscalls_32.h delete mode 100644 arch/powerpc/kernel/ppc32.h (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 8e1787ff6c93..7911738ad4a9 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -8,6 +8,14 @@ #include #include +#ifdef CONFIG_PPC64 +#include +#endif +#include +#include + +struct rtas_args; + /* * long long munging: * The 32 bit ABI passes long longs in an odd even register pair. @@ -20,44 +28,89 @@ #define merge_64(high, low) (((u64)high << 32) | low) #endif -struct rtas_args; +long sys_ni_syscall(void); + +/* + * PowerPC architecture-specific syscalls + */ + +long sys_rtas(struct rtas_args __user *uargs); +#ifdef CONFIG_PPC64 +long sys_ppc64_personality(unsigned long personality); +#ifdef CONFIG_COMPAT +long compat_sys_ppc64_personality(unsigned long personality); +#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_PPC64 */ + +long sys_swapcontext(struct ucontext __user *old_ctx, + struct ucontext __user *new_ctx, long ctx_size); long sys_mmap(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, off_t offset); long sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); -long sys_ppc64_personality(unsigned long personality); -long sys_rtas(struct rtas_args __user *uargs); -long sys_ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, - u32 len_high, u32 len_low); +long sys_switch_endian(void); -#ifdef CONFIG_COMPAT -long compat_sys_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); - -compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, - u32 reg6, u32 pos1, u32 pos2); +#ifdef CONFIG_PPC32 +long sys_sigreturn(void); +long sys_debug_setcontext(struct ucontext __user *ctx, int ndbg, + struct sig_dbg_op __user *dbg); +#endif -compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, - u32 reg6, u32 pos1, u32 pos2); +long sys_rt_sigreturn(void); -compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count); +long sys_subpage_prot(unsigned long addr, + unsigned long len, u32 __user *map); -int compat_sys_truncate64(const char __user *path, u32 reg4, - unsigned long len1, unsigned long len2); +#ifdef CONFIG_COMPAT +long compat_sys_swapcontext(struct ucontext32 __user *old_ctx, + struct ucontext32 __user *new_ctx, + int ctx_size); +long compat_sys_old_getrlimit(unsigned int resource, + struct compat_rlimit __user *rlim); +long compat_sys_sigreturn(void); +long compat_sys_rt_sigreturn(void); +#endif /* CONFIG_COMPAT */ -int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, - unsigned long len2); +/* + * Architecture specific signatures required by long long munging: + * The 32 bit ABI passes long longs in an odd even register pair. + * The following signatures provide a machine long parameter for + * each register that will be supplied. The implementation is + * responsible for combining parameter pairs. + */ +#ifdef CONFIG_COMPAT +long compat_sys_mmap2(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); +long compat_sys_ppc_pread64(unsigned int fd, + char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); +long compat_sys_ppc_pwrite64(unsigned int fd, + const char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); +long compat_sys_ppc_readahead(int fd, u32 r4, + u32 offset1, u32 offset2, u32 count); +long compat_sys_ppc_truncate64(const char __user *path, u32 reg4, + unsigned long len1, unsigned long len2); +long compat_sys_ppc_ftruncate64(unsigned int fd, u32 reg4, + unsigned long len1, unsigned long len2); long compat_sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, size_t len, int advice); +long compat_sys_ppc_sync_file_range2(int fd, unsigned int flags, + unsigned int offset1, + unsigned int offset2, + unsigned int nbytes1, + unsigned int nbytes2); +#endif /* CONFIG_COMPAT */ -long compat_sys_sync_file_range2(int fd, unsigned int flags, - unsigned int offset1, unsigned int offset2, - unsigned int nbytes1, unsigned int nbytes2); +#if defined(CONFIG_PPC32) || defined(CONFIG_COMPAT) +long sys_ppc_fadvise64_64(int fd, int advice, + u32 offset_high, u32 offset_low, + u32 len_high, u32 len_low); #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/syscalls_32.h b/arch/powerpc/include/asm/syscalls_32.h new file mode 100644 index 000000000000..749255568be9 --- /dev/null +++ b/arch/powerpc/include/asm/syscalls_32.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ASM_POWERPC_SYSCALLS_32_H +#define _ASM_POWERPC_SYSCALLS_32_H + +#include +#include +#include + +/* + * Data types and macros for providing 32b PowerPC support. + */ + +/* These are here to support 32-bit syscalls on a 64-bit kernel. */ + +struct pt_regs32 { + unsigned int gpr[32]; + unsigned int nip; + unsigned int msr; + unsigned int orig_gpr3; /* Used for restarting system calls */ + unsigned int ctr; + unsigned int link; + unsigned int xer; + unsigned int ccr; + unsigned int mq; /* 601 only (not used at present) */ + unsigned int trap; /* Reason for being here */ + unsigned int dar; /* Fault registers */ + unsigned int dsisr; + unsigned int result; /* Result of a system call */ +}; + +struct sigcontext32 { + unsigned int _unused[4]; + int signal; + compat_uptr_t handler; + unsigned int oldmask; + compat_uptr_t regs; /* 4 byte pointer to the pt_regs32 structure. */ +}; + +struct mcontext32 { + elf_gregset_t32 mc_gregs; + elf_fpregset_t mc_fregs; + unsigned int mc_pad[2]; + elf_vrregset_t32 mc_vregs __attribute__((__aligned__(16))); + elf_vsrreghalf_t32 mc_vsregs __attribute__((__aligned__(16))); +}; + +struct ucontext32 { + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + int uc_pad[7]; + compat_uptr_t uc_regs; /* points to uc_mcontext field */ + compat_sigset_t uc_sigmask; /* mask last for extensibility */ + /* glibc has 1024-bit signal masks, ours are 64-bit */ + int uc_maskext[30]; + int uc_pad2[3]; + struct mcontext32 uc_mcontext; +}; + +#endif // _ASM_POWERPC_SYSCALLS_32_H diff --git a/arch/powerpc/kernel/ppc32.h b/arch/powerpc/kernel/ppc32.h deleted file mode 100644 index 2346f8c7ff2e..000000000000 --- a/arch/powerpc/kernel/ppc32.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _PPC64_PPC32_H -#define _PPC64_PPC32_H - -#include -#include -#include - -/* - * Data types and macros for providing 32b PowerPC support. - */ - -/* These are here to support 32-bit syscalls on a 64-bit kernel. */ - -struct pt_regs32 { - unsigned int gpr[32]; - unsigned int nip; - unsigned int msr; - unsigned int orig_gpr3; /* Used for restarting system calls */ - unsigned int ctr; - unsigned int link; - unsigned int xer; - unsigned int ccr; - unsigned int mq; /* 601 only (not used at present) */ - unsigned int trap; /* Reason for being here */ - unsigned int dar; /* Fault registers */ - unsigned int dsisr; - unsigned int result; /* Result of a system call */ -}; - -struct sigcontext32 { - unsigned int _unused[4]; - int signal; - compat_uptr_t handler; - unsigned int oldmask; - compat_uptr_t regs; /* 4 byte pointer to the pt_regs32 structure. */ -}; - -struct mcontext32 { - elf_gregset_t32 mc_gregs; - elf_fpregset_t mc_fregs; - unsigned int mc_pad[2]; - elf_vrregset_t32 mc_vregs __attribute__((__aligned__(16))); - elf_vsrreghalf_t32 mc_vsregs __attribute__((__aligned__(16))); -}; - -struct ucontext32 { - unsigned int uc_flags; - unsigned int uc_link; - compat_stack_t uc_stack; - int uc_pad[7]; - compat_uptr_t uc_regs; /* points to uc_mcontext field */ - compat_sigset_t uc_sigmask; /* mask last for extensibility */ - /* glibc has 1024-bit signal masks, ours are 64-bit */ - int uc_maskext[30]; - int uc_pad2[3]; - struct mcontext32 uc_mcontext; -}; - -#endif /* _PPC64_PPC32_H */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 157a7403e3eb..c114c7f25645 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -43,7 +43,7 @@ #include #include #ifdef CONFIG_PPC64 -#include "ppc32.h" +#include #include #else #include diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c index b83c47b7947f..ea8cfe3806dc 100644 --- a/arch/powerpc/perf/callchain_32.c +++ b/arch/powerpc/perf/callchain_32.c @@ -19,7 +19,7 @@ #include "callchain.h" #ifdef CONFIG_PPC64 -#include "../kernel/ppc32.h" +#include #else /* CONFIG_PPC64 */ #define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE -- cgit v1.2.3 From 8640de0dee49cec50040d9845a2bc96fd15adc9e Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:55:58 +1000 Subject: powerpc: Use common syscall handler type Cause syscall handlers to be typed as follows when called indirectly throughout the kernel. This is to allow for better type checking. typedef long (*syscall_fn)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); Since both 32 and 64-bit abis allow for at least the first six machine-word length parameters to a function to be passed by registers, even handlers which admit fewer than six parameters may be viewed as having the above type. Coercing syscalls to syscall_fn requires a cast to void* to avoid -Wcast-function-type. Fixup comparisons in VDSO to avoid pointer-integer comparison. Introduce explicit cast on systems with SPUs. Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-19-rmclure@linux.ibm.com --- arch/powerpc/include/asm/syscall.h | 7 +++++-- arch/powerpc/include/asm/syscalls.h | 1 + arch/powerpc/kernel/syscall.c | 2 -- arch/powerpc/kernel/systbl.c | 11 ++++++++--- arch/powerpc/kernel/vdso.c | 4 ++-- arch/powerpc/platforms/cell/spu_callbacks.c | 6 +++--- 6 files changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 25fc8ad9a27a..d2a8dfd5de33 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -14,9 +14,12 @@ #include #include +typedef long (*syscall_fn)(unsigned long, unsigned long, unsigned long, + unsigned long, unsigned long, unsigned long); + /* ftrace syscalls requires exporting the sys_call_table */ -extern const unsigned long sys_call_table[]; -extern const unsigned long compat_sys_call_table[]; +extern const syscall_fn sys_call_table[]; +extern const syscall_fn compat_sys_call_table[]; static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 7911738ad4a9..211bb8393dee 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_PPC64 #include #endif diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c index 64102a64fd84..9875486f6168 100644 --- a/arch/powerpc/kernel/syscall.c +++ b/arch/powerpc/kernel/syscall.c @@ -12,8 +12,6 @@ #include -typedef long (*syscall_fn)(long, long, long, long, long, long); - /* Has to run notrace because it is entered not completely "reconciled" */ notrace long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, diff --git a/arch/powerpc/kernel/systbl.c b/arch/powerpc/kernel/systbl.c index ce52bd2ec292..d64dfafc2e5e 100644 --- a/arch/powerpc/kernel/systbl.c +++ b/arch/powerpc/kernel/systbl.c @@ -16,9 +16,14 @@ #include #define __SYSCALL_WITH_COMPAT(nr, entry, compat) __SYSCALL(nr, entry) -#define __SYSCALL(nr, entry) [nr] = (unsigned long) &entry, -const unsigned long sys_call_table[] = { +/* + * Coerce syscall handlers with arbitrary parameters to common type + * requires cast to void* to avoid -Wcast-function-type. + */ +#define __SYSCALL(nr, entry) [nr] = (void *) entry, + +const syscall_fn sys_call_table[] = { #ifdef CONFIG_PPC64 #include #else @@ -29,7 +34,7 @@ const unsigned long sys_call_table[] = { #ifdef CONFIG_COMPAT #undef __SYSCALL_WITH_COMPAT #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) -const unsigned long compat_sys_call_table[] = { +const syscall_fn compat_sys_call_table[] = { #include }; #endif /* CONFIG_COMPAT */ diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index bf9574ec26ce..fcca06d200d3 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -304,10 +304,10 @@ static void __init vdso_setup_syscall_map(void) unsigned int i; for (i = 0; i < NR_syscalls; i++) { - if (sys_call_table[i] != (unsigned long)&sys_ni_syscall) + if (sys_call_table[i] != (void *)&sys_ni_syscall) vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); if (IS_ENABLED(CONFIG_COMPAT) && - compat_sys_call_table[i] != (unsigned long)&sys_ni_syscall) + compat_sys_call_table[i] != (void *)&sys_ni_syscall) vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); } } diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c index fe0d8797a00a..e780c14c5733 100644 --- a/arch/powerpc/platforms/cell/spu_callbacks.c +++ b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -34,15 +34,15 @@ * mbind, mq_open, ipc, ... */ -static void *spu_syscall_table[] = { +static const syscall_fn spu_syscall_table[] = { #define __SYSCALL_WITH_COMPAT(nr, entry, compat) __SYSCALL(nr, entry) -#define __SYSCALL(nr, entry) [nr] = entry, +#define __SYSCALL(nr, entry) [nr] = (void *) entry, #include }; long spu_sys_callback(struct spu_syscall_block *s) { - long (*syscall)(u64 a1, u64 a2, u64 a3, u64 a4, u64 a5, u64 a6); + syscall_fn syscall; if (s->nr_ret >= ARRAY_SIZE(spu_syscall_table)) { pr_debug("%s: invalid syscall #%lld", __func__, s->nr_ret); -- cgit v1.2.3 From f8971c627b14040e533768985a99f4fd6ffa420f Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:56:00 +1000 Subject: powerpc: Change system_call_exception calling convention Change system_call_exception arguments to pass a pointer to a stack frame container caller state, as well as the original r0, which determines the number of the syscall. This has been observed to yield improved performance to passing them by registers, circumventing the need to allocate a stack frame. Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin [mpe: Retain clearing of high bits of args for compat tasks] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-21-rmclure@linux.ibm.com --- arch/powerpc/include/asm/interrupt.h | 3 +-- arch/powerpc/kernel/entry_32.S | 6 +++--- arch/powerpc/kernel/interrupt_64.S | 20 ++++++++++---------- arch/powerpc/kernel/syscall.c | 18 +++++++++--------- 4 files changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 84a1cdc3204c..0d3405bd55c4 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -665,8 +665,7 @@ static inline void interrupt_cond_local_irq_enable(struct pt_regs *regs) local_irq_enable(); } -long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, - unsigned long r0, struct pt_regs *regs); +long system_call_exception(struct pt_regs *regs, unsigned long r0); notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs, long scv); notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs); notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index e3d43b6e3197..55fcc70f2cc0 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -122,9 +122,9 @@ transfer_to_syscall: SAVE_NVGPRS(r1) kuep_lock - /* Calling convention has r9 = orig r0, r10 = regs */ - addi r10,r1,STACK_FRAME_OVERHEAD - mr r9,r0 + /* Calling convention has r3 = regs, r4 = orig r0 */ + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r0 bl system_call_exception ret_from_syscall: diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 931f984b02b1..90dbd6a8d4da 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -60,7 +60,7 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) ld r2,PACATOC(r13) mfcr r12 li r11,0 - /* Can we avoid saving r3-r8 in common case? */ + /* Save syscall parameters in r3-r8 */ SAVE_GPRS(3, 8, r1) /* Zero r9-r12, this should only be required when restoring all GPRs */ std r11,GPR9(r1) @@ -77,9 +77,11 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) std r11,_TRAP(r1) std r12,_CCR(r1) std r3,ORIG_GPR3(r1) - addi r10,r1,STACK_FRAME_OVERHEAD + /* Calling convention has r3 = regs, r4 = orig r0 */ + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r0 ld r11,exception_marker@toc(r2) - std r11,-16(r10) /* "regshere" marker */ + std r11,-16(r3) /* "regshere" marker */ BEGIN_FTR_SECTION HMT_MEDIUM @@ -94,8 +96,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * but this is the best we can do. */ - /* Calling convention has r9 = orig r0, r10 = regs */ - mr r9,r0 bl system_call_exception .Lsyscall_vectored_\name\()_exit: @@ -227,7 +227,7 @@ END_BTB_FLUSH_SECTION ld r2,PACATOC(r13) mfcr r12 li r11,0 - /* Can we avoid saving r3-r8 in common case? */ + /* Save syscall parameters in r3-r8 */ SAVE_GPRS(3, 8, r1) /* Zero r9-r12, this should only be required when restoring all GPRs */ std r11,GPR9(r1) @@ -250,9 +250,11 @@ END_BTB_FLUSH_SECTION std r11,_TRAP(r1) std r12,_CCR(r1) std r3,ORIG_GPR3(r1) - addi r10,r1,STACK_FRAME_OVERHEAD + /* Calling convention has r3 = regs, r4 = orig r0 */ + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r0 ld r11,exception_marker@toc(r2) - std r11,-16(r10) /* "regshere" marker */ + std r11,-16(r3) /* "regshere" marker */ #ifdef CONFIG_PPC_BOOK3S li r11,1 @@ -273,8 +275,6 @@ END_BTB_FLUSH_SECTION wrteei 1 #endif - /* Calling convention has r9 = orig r0, r10 = regs */ - mr r9,r0 bl system_call_exception .Lsyscall_exit: diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c index 9875486f6168..2c002cbbc676 100644 --- a/arch/powerpc/kernel/syscall.c +++ b/arch/powerpc/kernel/syscall.c @@ -13,10 +13,9 @@ /* Has to run notrace because it is entered not completely "reconciled" */ -notrace long system_call_exception(long r3, long r4, long r5, - long r6, long r7, long r8, - unsigned long r0, struct pt_regs *regs) +notrace long system_call_exception(struct pt_regs *regs, unsigned long r0) { + unsigned long r3, r4, r5, r6, r7, r8; long ret; syscall_fn f; @@ -136,12 +135,6 @@ notrace long system_call_exception(long r3, long r4, long r5, r0 = do_syscall_trace_enter(regs); if (unlikely(r0 >= NR_syscalls)) return regs->gpr[3]; - r3 = regs->gpr[3]; - r4 = regs->gpr[4]; - r5 = regs->gpr[5]; - r6 = regs->gpr[6]; - r7 = regs->gpr[7]; - r8 = regs->gpr[8]; } else if (unlikely(r0 >= NR_syscalls)) { if (unlikely(trap_is_unsupported_scv(regs))) { @@ -152,6 +145,13 @@ notrace long system_call_exception(long r3, long r4, long r5, return -ENOSYS; } + r3 = regs->gpr[3]; + r4 = regs->gpr[4]; + r5 = regs->gpr[5]; + r6 = regs->gpr[6]; + r7 = regs->gpr[7]; + r8 = regs->gpr[8]; + /* May be faster to do array_index_nospec? */ barrier_nospec(); -- cgit v1.2.3 From 7e92e01b724526b98cbc7f03dd4afa0295780d56 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Wed, 21 Sep 2022 16:56:01 +1000 Subject: powerpc: Provide syscall wrapper Implement syscall wrapper as per s390, x86, arm64. When enabled cause handlers to accept parameters from a stack frame rather than from user scratch register state. This allows for user registers to be safely cleared in order to reduce caller influence on speculation within syscall routine. The wrapper is a macro that emits syscall handler symbols that call into the target handler, obtaining its parameters from a struct pt_regs on the stack. As registers are already saved to the stack prior to calling system_call_exception, it appears that this function is executed more efficiently with the new stack-pointer convention than with parameters passed by registers, avoiding the allocation of a stack frame for this method. On a 32-bit system, we see >20% performance increases on the null_syscall microbenchmark, and on a Power 8 the performance gains amortise the cost of clearing and restoring registers which is implemented at the end of this series, seeing final result of ~5.6% performance improvement on null_syscall. Syscalls are wrapped in this fashion on all platforms except for the Cell processor as this commit does not provide SPU support. This can be quickly fixed in a successive patch, but requires spu_sys_callback to allocate a pt_regs structure to satisfy the wrapped calling convention. Co-developed-by: Andrew Donnellan Signed-off-by: Andrew Donnellan Signed-off-by: Rohan McLure Reviewed-by: Nicholas Piggin [mpe: Make incompatible with COMPAT to retain clearing of high bits of args] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220921065605.1051927-22-rmclure@linux.ibm.com --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/syscall.h | 4 +++ arch/powerpc/include/asm/syscall_wrapper.h | 51 ++++++++++++++++++++++++++++++ arch/powerpc/include/asm/syscalls.h | 24 ++++++++++++-- arch/powerpc/kernel/syscall.c | 34 ++++++++++---------- arch/powerpc/kernel/systbl.c | 7 ++++ arch/powerpc/kernel/vdso.c | 2 ++ 7 files changed, 105 insertions(+), 18 deletions(-) create mode 100644 arch/powerpc/include/asm/syscall_wrapper.h (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 44d98d32e3bf..237c0e071e5e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_STRICT_KERNEL_RWX if (PPC_BOOK3S || PPC_8xx || 40x) && !HIBERNATION select ARCH_HAS_STRICT_KERNEL_RWX if PPC_85xx && !HIBERNATION && !RANDOMIZE_BASE select ARCH_HAS_STRICT_MODULE_RWX if ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_HAS_SYSCALL_WRAPPER if !SPU_BASE && !COMPAT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index d2a8dfd5de33..3dd36c5e334a 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -14,8 +14,12 @@ #include #include +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +typedef long (*syscall_fn)(const struct pt_regs *); +#else typedef long (*syscall_fn)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* ftrace syscalls requires exporting the sys_call_table */ extern const syscall_fn sys_call_table[]; diff --git a/arch/powerpc/include/asm/syscall_wrapper.h b/arch/powerpc/include/asm/syscall_wrapper.h new file mode 100644 index 000000000000..75b41b173f7a --- /dev/null +++ b/arch/powerpc/include/asm/syscall_wrapper.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * syscall_wrapper.h - powerpc specific wrappers to syscall definitions + * + * Based on arch/{x86,arm64}/include/asm/syscall_wrapper.h + */ + +#ifndef __ASM_POWERPC_SYSCALL_WRAPPER_H +#define __ASM_POWERPC_SYSCALL_WRAPPER_H + +struct pt_regs; + +#define SC_POWERPC_REGS_TO_ARGS(x, ...) \ + __MAP(x,__SC_ARGS \ + ,,regs->gpr[3],,regs->gpr[4],,regs->gpr[5] \ + ,,regs->gpr[6],,regs->gpr[7],,regs->gpr[8]) + +#define __SYSCALL_DEFINEx(x, name, ...) \ + long __powerpc_sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__powerpc_sys##name, ERRNO); \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + long __powerpc_sys##name(const struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_POWERPC_REGS_TO_ARGS(x,__VA_ARGS__)); \ + } \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + long __powerpc_sys_##sname(const struct pt_regs *__unused); \ + ALLOW_ERROR_INJECTION(__powerpc_sys_##sname, ERRNO); \ + long __powerpc_sys_##sname(const struct pt_regs *__unused) + +#define COND_SYSCALL(name) \ + long __powerpc_sys_##name(const struct pt_regs *regs); \ + long __weak __powerpc_sys_##name(const struct pt_regs *regs) \ + { \ + return sys_ni_syscall(); \ + } + +#define SYS_NI(name) SYSCALL_ALIAS(__powerpc_sys_##name, sys_ni_posix_timers); + +#endif // __ASM_POWERPC_SYSCALL_WRAPPER_H diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 211bb8393dee..49bbc3e0733d 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -15,6 +15,12 @@ #include #include +#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +long sys_ni_syscall(void); +#else +long sys_ni_syscall(const struct pt_regs *regs); +#endif + struct rtas_args; /* @@ -29,12 +35,12 @@ struct rtas_args; #define merge_64(high, low) (((u64)high << 32) | low) #endif -long sys_ni_syscall(void); - /* * PowerPC architecture-specific syscalls */ +#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER + long sys_rtas(struct rtas_args __user *uargs); #ifdef CONFIG_PPC64 @@ -114,5 +120,19 @@ long sys_ppc_fadvise64_64(int fd, int advice, u32 len_high, u32 len_low); #endif +#else + +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#define __SYSCALL(nr, entry) \ + long __powerpc_##entry(const struct pt_regs *regs); + +#ifdef CONFIG_PPC64 +#include +#else +#include +#endif /* CONFIG_PPC64 */ + +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_SYSCALLS_H */ diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c index 2c002cbbc676..18b9d325395f 100644 --- a/arch/powerpc/kernel/syscall.c +++ b/arch/powerpc/kernel/syscall.c @@ -15,7 +15,6 @@ /* Has to run notrace because it is entered not completely "reconciled" */ notrace long system_call_exception(struct pt_regs *regs, unsigned long r0) { - unsigned long r3, r4, r5, r6, r7, r8; long ret; syscall_fn f; @@ -145,31 +144,34 @@ notrace long system_call_exception(struct pt_regs *regs, unsigned long r0) return -ENOSYS; } - r3 = regs->gpr[3]; - r4 = regs->gpr[4]; - r5 = regs->gpr[5]; - r6 = regs->gpr[6]; - r7 = regs->gpr[7]; - r8 = regs->gpr[8]; - /* May be faster to do array_index_nospec? */ barrier_nospec(); +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER + // No COMPAT if we have SYSCALL_WRAPPER, see Kconfig + f = (void *)sys_call_table[r0]; + ret = f(regs); +#else if (unlikely(is_compat_task())) { + unsigned long r3, r4, r5, r6, r7, r8; + f = (void *)compat_sys_call_table[r0]; - r3 &= 0x00000000ffffffffULL; - r4 &= 0x00000000ffffffffULL; - r5 &= 0x00000000ffffffffULL; - r6 &= 0x00000000ffffffffULL; - r7 &= 0x00000000ffffffffULL; - r8 &= 0x00000000ffffffffULL; + r3 = regs->gpr[3] & 0x00000000ffffffffULL; + r4 = regs->gpr[4] & 0x00000000ffffffffULL; + r5 = regs->gpr[5] & 0x00000000ffffffffULL; + r6 = regs->gpr[6] & 0x00000000ffffffffULL; + r7 = regs->gpr[7] & 0x00000000ffffffffULL; + r8 = regs->gpr[8] & 0x00000000ffffffffULL; + ret = f(r3, r4, r5, r6, r7, r8); } else { f = (void *)sys_call_table[r0]; - } - ret = f(r3, r4, r5, r6, r7, r8); + ret = f(regs->gpr[3], regs->gpr[4], regs->gpr[5], + regs->gpr[6], regs->gpr[7], regs->gpr[8]); + } +#endif /* * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), diff --git a/arch/powerpc/kernel/systbl.c b/arch/powerpc/kernel/systbl.c index d64dfafc2e5e..9d7c5a596171 100644 --- a/arch/powerpc/kernel/systbl.c +++ b/arch/powerpc/kernel/systbl.c @@ -15,13 +15,20 @@ #include #include +#undef __SYSCALL_WITH_COMPAT #define __SYSCALL_WITH_COMPAT(nr, entry, compat) __SYSCALL(nr, entry) +#undef __SYSCALL +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +#define __SYSCALL(nr, entry) [nr] = __powerpc_##entry, +#define __powerpc_sys_ni_syscall sys_ni_syscall +#else /* * Coerce syscall handlers with arbitrary parameters to common type * requires cast to void* to avoid -Wcast-function-type. */ #define __SYSCALL(nr, entry) [nr] = (void *) entry, +#endif const syscall_fn sys_call_table[] = { #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index fcca06d200d3..e1f36fd61db3 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -39,6 +39,8 @@ extern char vdso32_start, vdso32_end; extern char vdso64_start, vdso64_end; +long sys_ni_syscall(void); + /* * The vdso data page (aka. systemcfg for old ppc64 fans) is here. * Once the early boot kernel code no longer needs to muck around -- cgit v1.2.3 From b19448fe846baad689ff51a991ebfc74b4b5e0a8 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Tue, 23 Aug 2022 01:15:01 +0200 Subject: powerpc: Add support for early debugging via Serial 16550 console MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently powerpc early debugging contains lot of platform specific options, but does not support standard UART / serial 16550 console. Later legacy_serial.c code supports registering UART as early debug console from device tree but it is not early during booting, but rather later after machine description code finishes. So for real early debugging via UART is current code unsuitable. Add support for new early debugging option CONFIG_PPC_EARLY_DEBUG_16550 which enable Serial 16550 console on address defined by new option CONFIG_PPC_EARLY_DEBUG_16550_PHYSADDR and by stride by option CONFIG_PPC_EARLY_DEBUG_16550_STRIDE. With this change it is possible to debug powerpc machine descriptor code. For example this early debugging code can print on serial console also "No suitable machine description found" error which is done before legacy_serial.c code. Signed-off-by: Pali Rohár Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220822231501.16827-1-pali@kernel.org --- arch/powerpc/Kconfig.debug | 15 +++++++++++++++ arch/powerpc/include/asm/udbg.h | 1 + arch/powerpc/kernel/udbg.c | 2 ++ arch/powerpc/kernel/udbg_16550.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index ae727d4218b9..6aaf8dc60610 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -283,6 +283,12 @@ config PPC_EARLY_DEBUG_MEMCONS This console provides input and output buffers stored within the kernel BSS and should be safe to select on any system. A debugger can then be used to read kernel output or send input to the console. + +config PPC_EARLY_DEBUG_16550 + bool "Serial 16550" + depends on PPC_UDBG_16550 + help + Select this to enable early debugging via Serial 16550 console endchoice config PPC_MEMCONS_OUTPUT_SIZE @@ -354,6 +360,15 @@ config PPC_EARLY_DEBUG_CPM_ADDR platform probing is done, all platforms selected must share the same address. +config PPC_EARLY_DEBUG_16550_PHYSADDR + hex "Early debug Serial 16550 physical address" + depends on PPC_EARLY_DEBUG_16550 + +config PPC_EARLY_DEBUG_16550_STRIDE + int "Early debug Serial 16550 stride" + depends on PPC_EARLY_DEBUG_16550 + default 1 + config FAIL_IOMMU bool "Fault-injection capability for IOMMU" depends on FAULT_INJECTION diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 524c2085070f..88add5593e6f 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -52,6 +52,7 @@ extern void __init udbg_init_ehv_bc(void); extern void __init udbg_init_ps3gelic(void); extern void __init udbg_init_debug_opal_raw(void); extern void __init udbg_init_debug_opal_hvsi(void); +extern void __init udbg_init_debug_16550(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_UDBG_H */ diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index b1544b2f6321..92b3fc258d11 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -67,6 +67,8 @@ void __init udbg_early_init(void) udbg_init_debug_opal_raw(); #elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI) udbg_init_debug_opal_hvsi(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_16550) + udbg_init_debug_16550(); #endif #ifdef CONFIG_PPC_EARLY_DEBUG diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c index ddfbc74bf85f..74ddf836f7a2 100644 --- a/arch/powerpc/kernel/udbg_16550.c +++ b/arch/powerpc/kernel/udbg_16550.c @@ -8,6 +8,7 @@ #include #include #include +#include extern u8 real_readb(volatile u8 __iomem *addr); extern void real_writeb(u8 data, volatile u8 __iomem *addr); @@ -296,3 +297,35 @@ void __init udbg_init_40x_realmode(void) } #endif /* CONFIG_PPC_EARLY_DEBUG_40x */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_16550 + +static void __iomem *udbg_uart_early_addr; + +void __init udbg_init_debug_16550(void) +{ + udbg_uart_early_addr = early_ioremap(CONFIG_PPC_EARLY_DEBUG_16550_PHYSADDR, 0x1000); + udbg_uart_init_mmio(udbg_uart_early_addr, CONFIG_PPC_EARLY_DEBUG_16550_STRIDE); +} + +static int __init udbg_init_debug_16550_ioremap(void) +{ + void __iomem *addr; + + if (!udbg_uart_early_addr) + return 0; + + addr = ioremap(CONFIG_PPC_EARLY_DEBUG_16550_PHYSADDR, 0x1000); + if (WARN_ON(!addr)) + return -ENOMEM; + + udbg_uart_init_mmio(addr, CONFIG_PPC_EARLY_DEBUG_16550_STRIDE); + early_iounmap(udbg_uart_early_addr, 0x1000); + udbg_uart_early_addr = NULL; + + return 0; +} + +early_initcall(udbg_init_debug_16550_ioremap); + +#endif /* CONFIG_PPC_EARLY_DEBUG_16550 */ -- cgit v1.2.3 From dabeb572adf24bbd7cb21d1cc4d118bdf2c2ab74 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 20 Sep 2022 22:22:58 +1000 Subject: powerpc: add ISA v3.0 / v3.1 wait opcode macro The wait instruction encoding changed between ISA v2.07 and ISA v3.0. In v3.1 the instruction gained a new field. Update the PPC_WAIT macro to the current encoding. Rename the older incompatible one with a _v203 suffix as it was introduced in v2.03 (the WC field was introduced in v2.07 but the kernel only uses WC=0). Reviewed-by: Segher Boessenkool Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220920122259.363092-1-npiggin@gmail.com --- arch/powerpc/include/asm/ppc-opcode.h | 7 +++++-- arch/powerpc/kernel/idle_64e.S | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index c6d724104ed1..21e33e46f4b8 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -330,6 +330,7 @@ #define __PPC_XSP(s) ((((s) & 0x1e) | (((s) >> 5) & 0x1)) << 21) #define __PPC_XTP(s) __PPC_XSP(s) #define __PPC_T_TLB(t) (((t) & 0x3) << 21) +#define __PPC_PL(p) (((p) & 0x3) << 16) #define __PPC_WC(w) (((w) & 0x3) << 21) #define __PPC_WS(w) (((w) & 0x1f) << 11) #define __PPC_SH(s) __PPC_WS(s) @@ -388,7 +389,8 @@ #define PPC_RAW_RFDI (0x4c00004e) #define PPC_RAW_RFMCI (0x4c00004c) #define PPC_RAW_TLBILX(t, a, b) (0x7c000024 | __PPC_T_TLB(t) | __PPC_RA0(a) | __PPC_RB(b)) -#define PPC_RAW_WAIT(w) (0x7c00007c | __PPC_WC(w)) +#define PPC_RAW_WAIT_v203 (0x7c00007c) +#define PPC_RAW_WAIT(w, p) (0x7c00003c | __PPC_WC(w) | __PPC_PL(p)) #define PPC_RAW_TLBIE(lp, a) (0x7c000264 | ___PPC_RB(a) | ___PPC_RS(lp)) #define PPC_RAW_TLBIE_5(rb, rs, ric, prs, r) \ (0x7c000264 | ___PPC_RB(rb) | ___PPC_RS(rs) | ___PPC_RIC(ric) | ___PPC_PRS(prs) | ___PPC_R(r)) @@ -606,7 +608,8 @@ #define PPC_TLBILX_ALL(a, b) PPC_TLBILX(0, a, b) #define PPC_TLBILX_PID(a, b) PPC_TLBILX(1, a, b) #define PPC_TLBILX_VA(a, b) PPC_TLBILX(3, a, b) -#define PPC_WAIT(w) stringify_in_c(.long PPC_RAW_WAIT(w)) +#define PPC_WAIT_v203 stringify_in_c(.long PPC_RAW_WAIT_v203) +#define PPC_WAIT(w, p) stringify_in_c(.long PPC_RAW_WAIT(w, p)) #define PPC_TLBIE(lp, a) stringify_in_c(.long PPC_RAW_TLBIE(lp, a)) #define PPC_TLBIE_5(rb, rs, ric, prs, r) \ stringify_in_c(.long PPC_RAW_TLBIE_5(rb, rs, ric, prs, r)) diff --git a/arch/powerpc/kernel/idle_64e.S b/arch/powerpc/kernel/idle_64e.S index 1736aad2afe9..0fc680e03dee 100644 --- a/arch/powerpc/kernel/idle_64e.S +++ b/arch/powerpc/kernel/idle_64e.S @@ -75,7 +75,7 @@ _GLOBAL(\name) .macro BOOK3E_IDLE_LOOP 1: - PPC_WAIT(0) + PPC_WAIT_v203 b 1b .endm -- cgit v1.2.3 From 9c7bfc2dc21e737e8e4a753630bce675e1e7c0ad Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 20 Sep 2022 22:22:59 +1000 Subject: powerpc/64s: Make POWER10 and later use pause_short in cpu_relax loops We want to move away from using SMT priority updates for cpu_relax, and use a 'wait' instruction which is similar to x86. As well as being a much better fit for what everybody else uses and tests with, priority nops are stateful which is nasty (interrupts have to consider they might be taken at a different priority), and they're expensive to execute, similar to a mtSPR which can effect other threads in the pipe. This has shown to give results that are less affected by code alignment on benchmarks that cause a lot of spin waiting (e.g., rwsem contention on unixbench filesystem benchmarks) on POWER10. QEMU TCG only supports this instruction correctly since v7.1, versions without the fix may cause hangs whne running POWER10 CPUs. Reviewed-by: Segher Boessenkool Signed-off-by: Nicholas Piggin [mpe: Fix checkpatch warnings RE the macros] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220920122259.363092-2-npiggin@gmail.com --- arch/powerpc/include/asm/processor.h | 22 +++++++++++++++++----- arch/powerpc/include/asm/vdso/processor.h | 8 +++++++- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 97a77b37daa3..9bff4ab8242d 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -355,11 +355,23 @@ static inline unsigned long __pack_fe01(unsigned int fpmode) #ifdef CONFIG_PPC64 -#define spin_begin() HMT_low() - -#define spin_cpu_relax() barrier() - -#define spin_end() HMT_medium() +#define spin_begin() \ + asm volatile(ASM_FTR_IFCLR( \ + "or 1,1,1", /* HMT_LOW */ \ + "nop", /* v3.1 uses pause_short in cpu_relax instead */ \ + %0) :: "i" (CPU_FTR_ARCH_31) : "memory") + +#define spin_cpu_relax() \ + asm volatile(ASM_FTR_IFCLR( \ + "nop", /* Before v3.1 use priority nops in spin_begin/end */ \ + PPC_WAIT(2, 0), /* aka pause_short */ \ + %0) :: "i" (CPU_FTR_ARCH_31) : "memory") + +#define spin_end() \ + asm volatile(ASM_FTR_IFCLR( \ + "or 2,2,2", /* HMT_MEDIUM */ \ + "nop", \ + %0) :: "i" (CPU_FTR_ARCH_31) : "memory") #endif diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h index 8d79f994b4aa..80d13207c568 100644 --- a/arch/powerpc/include/asm/vdso/processor.h +++ b/arch/powerpc/include/asm/vdso/processor.h @@ -22,7 +22,13 @@ #endif #ifdef CONFIG_PPC64 -#define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0) +#define cpu_relax() \ + asm volatile(ASM_FTR_IFCLR( \ + /* Pre-POWER10 uses low ; medium priority nops */ \ + "or 1,1,1 ; or 2,2,2", \ + /* POWER10 onward uses pause_short (wait 2,0) */ \ + PPC_WAIT(2, 0), \ + %0) :: "i" (CPU_FTR_ARCH_31) : "memory") #else #define cpu_relax() barrier() #endif -- cgit v1.2.3 From a5edf9815dd739fce660b4c8658f61b7d2517042 Mon Sep 17 00:00:00 2001 From: Nicholas Miehlbradt Date: Mon, 26 Sep 2022 07:57:26 +0000 Subject: powerpc/64s: Enable KFENCE on book3s64 KFENCE support was added for ppc32 in commit 90cbac0e995d ("powerpc: Enable KFENCE for PPC32"). Enable KFENCE on ppc64 architecture with hash and radix MMUs. It uses the same mechanism as debug pagealloc to protect/unprotect pages. All KFENCE kunit tests pass on both MMUs. KFENCE memory is initially allocated using memblock but is later marked as SLAB allocated. This necessitates the change to __pud_free to ensure that the KFENCE pages are freed appropriately. Based on previous work by Christophe Leroy and Jordan Niethe. Signed-off-by: Nicholas Miehlbradt Reviewed-by: Russell Currey Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926075726.2846-4-nicholas@linux.ibm.com --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/pgalloc.h | 6 ++++-- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- arch/powerpc/include/asm/kfence.h | 15 +++++++++++++++ arch/powerpc/mm/book3s64/hash_utils.c | 10 +++++----- arch/powerpc/mm/book3s64/radix_pgtable.c | 6 ++++-- 6 files changed, 30 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 237c0e071e5e..81c9f895d690 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -195,7 +195,7 @@ config PPC select HAVE_ARCH_KASAN if PPC_RADIX_MMU select HAVE_ARCH_KASAN if PPC_BOOK3E_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN - select HAVE_ARCH_KFENCE if PPC_BOOK3S_32 || PPC_8xx || 40x + select HAVE_ARCH_KFENCE if ARCH_SUPPORTS_DEBUG_PAGEALLOC select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index e1af0b394ceb..dd2cff53a111 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -113,9 +113,11 @@ static inline void __pud_free(pud_t *pud) /* * Early pud pages allocated via memblock allocator - * can't be directly freed to slab + * can't be directly freed to slab. KFENCE pages have + * both reserved and slab flags set so need to be freed + * kmem_cache_free. */ - if (PageReserved(page)) + if (PageReserved(page) && !PageSlab(page)) free_reserved_page(page); else kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b5f8264cc980..c436d8422654 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1106,7 +1106,7 @@ static inline void vmemmap_remove_mapping(unsigned long start, } #endif -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) static inline void __kernel_map_pages(struct page *page, int numpages, int enable) { if (radix_enabled()) diff --git a/arch/powerpc/include/asm/kfence.h b/arch/powerpc/include/asm/kfence.h index a9846b68c6b9..6fd2b4d486c5 100644 --- a/arch/powerpc/include/asm/kfence.h +++ b/arch/powerpc/include/asm/kfence.h @@ -11,11 +11,25 @@ #include #include +#ifdef CONFIG_PPC64_ELF_ABI_V1 +#define ARCH_FUNC_PREFIX "." +#endif + static inline bool arch_kfence_init_pool(void) { return true; } +#ifdef CONFIG_PPC64 +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + struct page *page = virt_to_page(addr); + + __kernel_map_pages(page, 1, !protect); + + return true; +} +#else static inline bool kfence_protect_page(unsigned long addr, bool protect) { pte_t *kpte = virt_to_kpte(addr); @@ -29,5 +43,6 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect) return true; } +#endif #endif /* __ASM_POWERPC_KFENCE_H */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 6d985acb1d39..df008edf7be0 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -424,7 +424,7 @@ repeat: break; cond_resched(); - if (debug_pagealloc_enabled() && + if (debug_pagealloc_enabled_or_kfence() && (paddr >> PAGE_SHIFT) < linear_map_hash_count) linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; } @@ -773,7 +773,7 @@ static void __init htab_init_page_sizes(void) bool aligned = true; init_hpte_page_sizes(); - if (!debug_pagealloc_enabled()) { + if (!debug_pagealloc_enabled_or_kfence()) { /* * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default @@ -1061,7 +1061,7 @@ static void __init htab_initialize(void) prot = pgprot_val(PAGE_KERNEL); - if (debug_pagealloc_enabled()) { + if (debug_pagealloc_enabled_or_kfence()) { linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; linear_map_hash_slots = memblock_alloc_try_nid( linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, @@ -1980,7 +1980,7 @@ repeat: return slot; } -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) static DEFINE_SPINLOCK(linear_map_hash_lock); static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) @@ -2053,7 +2053,7 @@ void hash__kernel_map_pages(struct page *page, int numpages, int enable) } local_irq_restore(flags); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 94a05c66be95..70f1580afc3a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -34,6 +34,8 @@ #include +#include + unsigned int mmu_base_pid; unsigned long radix_mem_block_size __ro_after_init; @@ -276,7 +278,7 @@ static int __meminit create_physical_mapping(unsigned long start, int psize; unsigned long max_mapping_size = radix_mem_block_size; - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_or_kfence()) max_mapping_size = PAGE_SIZE; start = ALIGN(start, PAGE_SIZE); @@ -899,7 +901,7 @@ void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long #endif #endif -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) void radix__kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long addr; -- cgit v1.2.3 From 56adbb7a8b6cc7fc9b940829c38494e53c9e57d1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 15:42:59 +1000 Subject: powerpc/64/interrupt: Fix false warning in context tracking due to idle state Commit 171476775d32 ("context_tracking: Convert state to atomic_t") added a CONTEXT_IDLE state which can be encountered by interrupts from kernel mode in the idle thread, causing a false positive warning. Fixes: 171476775d32 ("context_tracking: Convert state to atomic_t") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926054305.2671436-2-npiggin@gmail.com --- arch/powerpc/include/asm/interrupt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 0d3405bd55c4..73031783bb1e 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -195,7 +195,8 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs) * so avoid recursion. */ if (TRAP(regs) != INTERRUPT_PROGRAM) { - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL && + ct_state() != CONTEXT_IDLE); if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) BUG_ON(is_implicit_soft_masked(regs)); } -- cgit v1.2.3 From 9524f2278f2e6925f147d9140c83f658e7a7c84f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 15:43:02 +1000 Subject: powerpc/64s: Fix irq state management in runlatch functions When irqs are soft-disabled, MSR[EE] is volatile and can change from 1 to 0 asynchronously (if a PACA_IRQ_MUST_HARD_MASK interrupt hits). So it can not be used to check hard IRQ enabled status, except to confirm it is disabled. ppc64_runlatch_on/off functions use MSR this way to decide whether to re-enable MSR[EE] after disabling it, which leads to MSR[EE] being enabled when it shouldn't be (when a PACA_IRQ_MUST_HARD_MASK had disabled it between reading the MSR and clearing EE). This has been tolerated in the kernel previously, and it doesn't seem to cause a problem, but it is unexpected and may trip warnings or cause other problems as we tighten up this state management. Fix this by only re-enabling if PACA_IRQ_HARD_DIS is clear. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926054305.2671436-5-npiggin@gmail.com --- arch/powerpc/include/asm/runlatch.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/runlatch.h b/arch/powerpc/include/asm/runlatch.h index cfb390edf7d0..ceb66d761fe1 100644 --- a/arch/powerpc/include/asm/runlatch.h +++ b/arch/powerpc/include/asm/runlatch.h @@ -19,10 +19,9 @@ extern void __ppc64_runlatch_off(void); do { \ if (cpu_has_feature(CPU_FTR_CTRL) && \ test_thread_local_flags(_TLF_RUNLATCH)) { \ - unsigned long msr = mfmsr(); \ __hard_irq_disable(); \ __ppc64_runlatch_off(); \ - if (msr & MSR_EE) \ + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) \ __hard_irq_enable(); \ } \ } while (0) @@ -31,10 +30,9 @@ extern void __ppc64_runlatch_off(void); do { \ if (cpu_has_feature(CPU_FTR_CTRL) && \ !test_thread_local_flags(_TLF_RUNLATCH)) { \ - unsigned long msr = mfmsr(); \ __hard_irq_disable(); \ __ppc64_runlatch_on(); \ - if (msr & MSR_EE) \ + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) \ __hard_irq_enable(); \ } \ } while (0) -- cgit v1.2.3 From f7bff6e7759b1abb59334f6448f9ef3172c4c04a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 15:43:04 +1000 Subject: powerpc/64/interrupt: avoid BUG/WARN recursion in interrupt entry BUG/WARN are handled with a program interrupt which can turn into an infinite recursion when there are bugs in interrupt handler entry (which can be irritated by bugs in other parts of the code). There is one feeble attempt to avoid this recursion, but it misses several cases. Make a tidier macro for this and switch most bugs in the interrupt entry wrapper over to use it. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926054305.2671436-7-npiggin@gmail.com --- arch/powerpc/include/asm/interrupt.h | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 73031783bb1e..4745bb9998bd 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -74,6 +74,19 @@ #include #include +#ifdef CONFIG_PPC64 +/* + * WARN/BUG is handled with a program interrupt so minimise checks here to + * avoid recursion and maximise the chance of getting the first oops handled. + */ +#define INT_SOFT_MASK_BUG_ON(regs, cond) \ +do { \ + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && \ + (user_mode(regs) || (TRAP(regs) != INTERRUPT_PROGRAM))) \ + BUG_ON(cond); \ +} while (0) +#endif + #ifdef CONFIG_PPC_BOOK3S_64 extern char __end_soft_masked[]; bool search_kernel_soft_mask_table(unsigned long addr); @@ -170,8 +183,7 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs) * context. */ if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) { - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) - BUG_ON(!(regs->msr & MSR_EE)); + INT_SOFT_MASK_BUG_ON(regs, !(regs->msr & MSR_EE)); __hard_irq_enable(); } else { __hard_RI_enable(); @@ -194,20 +206,15 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs) * CT_WARN_ON comes here via program_check_exception, * so avoid recursion. */ - if (TRAP(regs) != INTERRUPT_PROGRAM) { + if (TRAP(regs) != INTERRUPT_PROGRAM) CT_WARN_ON(ct_state() != CONTEXT_KERNEL && ct_state() != CONTEXT_IDLE); - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) - BUG_ON(is_implicit_soft_masked(regs)); - } - - /* Move this under a debugging check */ - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && - arch_irq_disabled_regs(regs)) - BUG_ON(search_kernel_restart_table(regs->nip)); + INT_SOFT_MASK_BUG_ON(regs, is_implicit_soft_masked(regs)); + INT_SOFT_MASK_BUG_ON(regs, arch_irq_disabled_regs(regs) && + search_kernel_restart_table(regs->nip)); } - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) - BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE)); + INT_SOFT_MASK_BUG_ON(regs, !arch_irq_disabled_regs(regs) && + !(regs->msr & MSR_EE)); #endif booke_restore_dbcr0(); -- cgit v1.2.3 From 17773afdcd1589c5925a984f512330410cb2ba4f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 13:40:53 +1000 Subject: powerpc/64: use 32-bit immediate for STACK_FRAME_REGS_MARKER Using a 32-bit constant for this marker allows it to be loaded with two ALU instructions, like 32-bit. This avoids a TOC entry and a TOC load that depends on the r2 value that has just been loaded from the PACA. This changes the value for 32-bit as well, so both have the same value in the low 4 bytes and 64-bit has 0 in the top bytes. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926034057.2360083-2-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 4 ++-- arch/powerpc/kernel/entry_32.S | 6 +++--- arch/powerpc/kernel/exceptions-64e.S | 8 +------- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kernel/head_64.S | 7 ------- arch/powerpc/kernel/interrupt_64.S | 6 +++--- 6 files changed, 10 insertions(+), 23 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index a03403695cd4..5b496e589d54 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -99,6 +99,8 @@ struct pt_regs #define STACK_FRAME_WITH_PT_REGS (STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)) +#define STACK_FRAME_REGS_MARKER ASM_CONST(0x72656773) + #ifdef __powerpc64__ /* @@ -115,7 +117,6 @@ struct pt_regs #define STACK_FRAME_OVERHEAD 112 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ -#define STACK_FRAME_REGS_MARKER ASM_CONST(0x7265677368657265) #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + \ STACK_FRAME_OVERHEAD + KERNEL_REDZONE_SIZE) #define STACK_FRAME_MARKER 12 @@ -136,7 +137,6 @@ struct pt_regs #define KERNEL_REDZONE_SIZE 0 #define STACK_FRAME_OVERHEAD 16 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 1 /* Location of LR in stack frame */ -#define STACK_FRAME_REGS_MARKER ASM_CONST(0x72656773) #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_FRAME_MARKER 2 #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 55fcc70f2cc0..3fc7c9886bb7 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -263,7 +263,7 @@ fast_exception_return: mtcr r10 lwz r10,_LINK(r11) mtlr r10 - /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + /* Clear the exception marker on the stack to avoid confusing stacktrace */ li r10, 0 stw r10, 8(r11) REST_GPR(10, r11) @@ -320,7 +320,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) li r0,0 /* - * Leaving a stale exception_marker on the stack can confuse + * Leaving a stale exception marker on the stack can confuse * the reliable stack unwinder later on. Clear it. */ stw r0,8(r1) @@ -372,7 +372,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) mtspr SPRN_XER,r5 /* - * Leaving a stale exception_marker on the stack can confuse + * Leaving a stale exception marker on the stack can confuse * the reliable stack unwinder later on. Clear it. */ stw r0,8(r1) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index d6b7aa0229be..478127153529 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -378,7 +378,7 @@ exc_##n##_common: \ ld r9,excf+EX_R1(r13); /* load orig r1 back from PACA */ \ lwz r10,excf+EX_CR(r13); /* load orig CR back from PACA */ \ lbz r11,PACAIRQSOFTMASK(r13); /* get current IRQ softe */ \ - ld r12,exception_marker@toc(r2); \ + LOAD_REG_IMMEDIATE(r12, STACK_FRAME_REGS_MARKER); \ ZEROIZE_GPR(0); \ std r3,GPR10(r1); /* save r10 to stackframe */ \ std r4,GPR11(r1); /* save r11 to stackframe */ \ @@ -459,12 +459,6 @@ exc_##n##_bad_stack: \ bl hdlr; \ b interrupt_return -/* This value is used to mark exception frames on the stack. */ - .section ".toc","aw" -exception_marker: - .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER - - /* * And here we have the exception vectors ! */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 43e538502f52..3b56fc689a04 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -589,7 +589,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) li r9,IVEC std r9,_TRAP(r1) /* set trap number */ li r10,0 - ld r11,exception_marker@toc(r2) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) std r10,RESULT(r1) /* clear regs->result */ std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */ .endm diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index da544ea0ce01..c79da95dfd2e 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -192,13 +192,6 @@ __secondary_hold: #endif CLOSE_FIXED_SECTION(first_256B) -/* This value is used to mark exception frames on the stack. */ - .section ".toc","aw" -/* This value is used to mark exception frames on the stack. */ -exception_marker: - .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER - .previous - /* * On server, we include the exception vectors code here as it * relies on absolute addressing which is only possible within diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 1dfa67e9199a..f02c55930b5c 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -80,7 +80,7 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) /* Calling convention has r3 = regs, r4 = orig r0 */ addi r3,r1,STACK_FRAME_OVERHEAD mr r4,r0 - ld r11,exception_marker@toc(r2) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) std r11,-16(r3) /* "regshere" marker */ BEGIN_FTR_SECTION @@ -253,7 +253,7 @@ END_BTB_FLUSH_SECTION /* Calling convention has r3 = regs, r4 = orig r0 */ addi r3,r1,STACK_FRAME_OVERHEAD mr r4,r0 - ld r11,exception_marker@toc(r2) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) std r11,-16(r3) /* "regshere" marker */ #ifdef CONFIG_PPC_BOOK3S @@ -614,7 +614,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) mtspr SPRN_XER,r5 /* - * Leaving a stale exception_marker on the stack can confuse + * Leaving a stale STACK_FRAME_REGS_MARKER on the stack can confuse * the reliable stack unwinder later on. Clear it. */ std r0,STACK_FRAME_OVERHEAD-16(r1) -- cgit v1.2.3 From 754f611774e4b9357a944f5b703dd291c85161cf Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 13:40:55 +1000 Subject: powerpc/64: switch asm helpers from GOT to TOC relative addressing There is no need to use GOT addressing within the kernel. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926034057.2360083-4-npiggin@gmail.com --- arch/powerpc/boot/ppc_asm.h | 3 ++- arch/powerpc/include/asm/ppc_asm.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/boot/ppc_asm.h b/arch/powerpc/boot/ppc_asm.h index 2824a3e32aab..a66cfd76fa4d 100644 --- a/arch/powerpc/boot/ppc_asm.h +++ b/arch/powerpc/boot/ppc_asm.h @@ -86,7 +86,8 @@ #ifdef CONFIG_PPC64_BOOT_WRAPPER #define LOAD_REG_ADDR(reg,name) \ - ld reg,name@got(r2) + addis reg,r2,name@toc@ha; \ + addi reg,reg,name@toc@l #else #define LOAD_REG_ADDR(reg,name) \ lis reg,name@ha; \ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index eeb7dc8cd45f..9972f3e90c9d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -337,7 +337,8 @@ n: rldimi reg, tmp, 32, 0 #define LOAD_REG_ADDR(reg,name) \ - ld reg,name@got(r2) + addis reg,r2,name@toc@ha; \ + addi reg,reg,name@toc@l #define LOAD_REG_ADDRBASE(reg,name) LOAD_REG_ADDR(reg,name) #define ADDROFF(name) 0 -- cgit v1.2.3 From 8e93fb33c84f68db20c0bc2821334a4c54c3e251 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 13:40:56 +1000 Subject: powerpc/64: provide a helper macro to load r2 with the kernel TOC A later change stops the kernel using r2 and loads it with a poison value. Provide a PACATOC loading abstraction which can hide this detail. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926034057.2360083-5-npiggin@gmail.com --- arch/powerpc/include/asm/ppc_asm.h | 6 ++++++ arch/powerpc/kernel/exceptions-64e.S | 12 ++++++------ arch/powerpc/kernel/exceptions-64s.S | 6 +++--- arch/powerpc/kernel/head_64.S | 4 ++-- arch/powerpc/kernel/interrupt_64.S | 12 ++++++------ arch/powerpc/kernel/optprobes_head.S | 2 +- arch/powerpc/kernel/trace/ftrace_low.S | 2 +- arch/powerpc/kernel/trace/ftrace_mprofile.S | 3 +-- arch/powerpc/kvm/book3s_64_entry.S | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++-- arch/powerpc/kvm/tm.S | 2 +- arch/powerpc/mm/nohash/tlb_low_64e.S | 2 +- arch/powerpc/platforms/powernv/opal-wrappers.S | 2 +- 13 files changed, 32 insertions(+), 27 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 9972f3e90c9d..cf6bec9770d6 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -327,6 +327,12 @@ n: #ifdef __powerpc64__ +#define __LOAD_PACA_TOC(reg) \ + ld reg,PACATOC(r13) + +#define LOAD_PACA_TOC() \ + __LOAD_PACA_TOC(r2) + #define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr #define LOAD_REG_IMMEDIATE_SYM(reg, tmp, expr) \ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 478127153529..ece1bd2a4a39 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -371,7 +371,7 @@ exc_##n##_common: \ ld r4,excf+EX_R11(r13); /* get back r11 */ \ mfspr r5,scratch; /* get back r13 */ \ SAVE_GPR(12, r1); /* save r12 in stackframe */ \ - ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ + LOAD_PACA_TOC(); /* get kernel TOC into r2 */ \ mflr r6; /* save LR in stackframe */ \ mfctr r7; /* save CTR in stackframe */ \ mfspr r8,SPRN_XER; /* save XER in stackframe */ \ @@ -687,7 +687,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) beq+ 1f #ifdef CONFIG_RELOCATABLE - ld r15,PACATOC(r13) + __LOAD_PACA_TOC(r15) ld r14,interrupt_base_book3e@got(r15) ld r15,__end_interrupts@got(r15) cmpld cr0,r10,r14 @@ -758,7 +758,7 @@ kernel_dbg_exc: beq+ 1f #ifdef CONFIG_RELOCATABLE - ld r15,PACATOC(r13) + __LOAD_PACA_TOC(r15) ld r14,interrupt_base_book3e@got(r15) ld r15,__end_interrupts@got(r15) cmpld cr0,r10,r14 @@ -883,7 +883,7 @@ kernel_dbg_exc: .macro SEARCH_RESTART_TABLE #ifdef CONFIG_RELOCATABLE - ld r11,PACATOC(r13) + __LOAD_PACA_TOC(r11) ld r14,__start___restart_table@got(r11) ld r15,__stop___restart_table@got(r11) #else @@ -1061,7 +1061,7 @@ bad_stack_book3e: std r11,0(r1) ZEROIZE_GPR(12) std r12,0(r11) - ld r2,PACATOC(r13) + LOAD_PACA_TOC() 1: addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_bad_stack b 1b @@ -1302,7 +1302,7 @@ a2_tlbinit_after_linear_map: /* Now we branch the new virtual address mapped by this entry */ #ifdef CONFIG_RELOCATABLE - ld r5,PACATOC(r13) + __LOAD_PACA_TOC(r5) ld r3,1f@got(r5) #else LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3b56fc689a04..5956ad47a8a0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -580,7 +580,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) std r2,GPR2(r1) /* save r2 in stackframe */ SAVE_GPRS(3, 8, r1) /* save r3 - r8 in stackframe */ mflr r9 /* Get LR, later save to stack */ - ld r2,PACATOC(r13) /* get kernel TOC into r2 */ + LOAD_PACA_TOC() /* get kernel TOC into r2 */ std r9,_LINK(r1) lbz r10,PACAIRQSOFTMASK(r13) mfspr r11,SPRN_XER /* save XER in stackframe */ @@ -610,7 +610,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) .macro SEARCH_RESTART_TABLE #ifdef CONFIG_RELOCATABLE mr r12,r2 - ld r2,PACATOC(r13) + LOAD_PACA_TOC() LOAD_REG_ADDR(r9, __start___restart_table) LOAD_REG_ADDR(r10, __stop___restart_table) mr r2,r12 @@ -640,7 +640,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) .macro SEARCH_SOFT_MASK_TABLE #ifdef CONFIG_RELOCATABLE mr r12,r2 - ld r2,PACATOC(r13) + LOAD_PACA_TOC() LOAD_REG_ADDR(r9, __start___soft_mask_table) LOAD_REG_ADDR(r10, __stop___soft_mask_table) mr r2,r12 diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index c79da95dfd2e..c63e0c086f03 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -841,7 +841,7 @@ __secondary_start: * before going into C code. */ start_secondary_prolog: - ld r2,PACATOC(r13) + LOAD_PACA_TOC() li r3,0 std r3,0(r1) /* Zero the stack frame pointer */ bl start_secondary @@ -981,7 +981,7 @@ start_here_common: std r1,PACAKSAVE(r13) /* Load the TOC (virtual address) */ - ld r2,PACATOC(r13) + LOAD_PACA_TOC() /* Mark interrupts soft and hard disabled (they might be enabled * in the PACA when doing hotplug) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index f02c55930b5c..904a5608cbe3 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -57,7 +57,7 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) std r0,GPR0(r1) std r10,GPR1(r1) std r2,GPR2(r1) - ld r2,PACATOC(r13) + LOAD_PACA_TOC() mfcr r12 li r11,0 /* Save syscall parameters in r3-r8 */ @@ -174,7 +174,7 @@ syscall_vectored_\name\()_restart: _ASM_NOKPROBE_SYMBOL(syscall_vectored_\name\()_restart) GET_PACA(r13) ld r1,PACA_EXIT_SAVE_R1(r13) - ld r2,PACATOC(r13) + LOAD_PACA_TOC() ld r3,RESULT(r1) addi r4,r1,STACK_FRAME_OVERHEAD li r11,IRQS_ALL_DISABLED @@ -224,7 +224,7 @@ START_BTB_FLUSH_SECTION BTB_FLUSH(r10) END_BTB_FLUSH_SECTION #endif - ld r2,PACATOC(r13) + LOAD_PACA_TOC() mfcr r12 li r11,0 /* Save syscall parameters in r3-r8 */ @@ -355,7 +355,7 @@ syscall_restart: _ASM_NOKPROBE_SYMBOL(syscall_restart) GET_PACA(r13) ld r1,PACA_EXIT_SAVE_R1(r13) - ld r2,PACATOC(r13) + LOAD_PACA_TOC() ld r3,RESULT(r1) addi r4,r1,STACK_FRAME_OVERHEAD li r11,IRQS_ALL_DISABLED @@ -502,7 +502,7 @@ interrupt_return_\srr\()_user_restart: _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user_restart) GET_PACA(r13) ld r1,PACA_EXIT_SAVE_R1(r13) - ld r2,PACATOC(r13) + LOAD_PACA_TOC() addi r3,r1,STACK_FRAME_OVERHEAD li r11,IRQS_ALL_DISABLED stb r11,PACAIRQSOFTMASK(r13) @@ -663,7 +663,7 @@ interrupt_return_\srr\()_kernel_restart: _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel_restart) GET_PACA(r13) ld r1,PACA_EXIT_SAVE_R1(r13) - ld r2,PACATOC(r13) + LOAD_PACA_TOC() addi r3,r1,STACK_FRAME_OVERHEAD li r11,IRQS_ALL_DISABLED stb r11,PACAIRQSOFTMASK(r13) diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index 5c7f0b4b784b..cd4e7bc32609 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -73,7 +73,7 @@ optprobe_template_entry: * further below. */ #ifdef CONFIG_PPC64 - ld r2,PACATOC(r13) + LOAD_PACA_TOC() #endif .global optprobe_template_op_address diff --git a/arch/powerpc/kernel/trace/ftrace_low.S b/arch/powerpc/kernel/trace/ftrace_low.S index 0bddf1fa6636..294d1e05958a 100644 --- a/arch/powerpc/kernel/trace/ftrace_low.S +++ b/arch/powerpc/kernel/trace/ftrace_low.S @@ -48,7 +48,7 @@ _GLOBAL(return_to_handler) * We might be called from a module. * Switch to our TOC to run inside the core kernel. */ - ld r2, PACATOC(r13) + LOAD_PACA_TOC() #else stwu r1, -16(r1) stw r3, 8(r1) diff --git a/arch/powerpc/kernel/trace/ftrace_mprofile.S b/arch/powerpc/kernel/trace/ftrace_mprofile.S index 33fcfb2eaded..d031093bc436 100644 --- a/arch/powerpc/kernel/trace/ftrace_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_mprofile.S @@ -83,8 +83,7 @@ #ifdef CONFIG_PPC64 /* Save callee's TOC in the ABI compliant location */ std r2, STK_GOT(r1) - ld r2,PACATOC(r13) /* get kernel TOC in r2 */ - + LOAD_PACA_TOC() /* get kernel TOC in r2 */ LOAD_REG_ADDR(r3, function_trace_op) ld r5,0(r3) #else diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S index e43704547a1e..6c2b1d17cb63 100644 --- a/arch/powerpc/kvm/book3s_64_entry.S +++ b/arch/powerpc/kvm/book3s_64_entry.S @@ -315,7 +315,7 @@ kvmppc_p9_exit_interrupt: reg = reg + 1 .endr - ld r2,PACATOC(r13) + LOAD_PACA_TOC() mflr r4 std r4,VCPU_LR(r3) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7ded202bf995..c984021e62c8 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1024,7 +1024,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Restore R1/R2 so we can handle faults */ ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) + LOAD_PACA_TOC() mfspr r10, SPRN_SRR0 mfspr r11, SPRN_SRR1 @@ -2727,7 +2727,7 @@ kvmppc_bad_host_intr: std r4, _CTR(r1) std r5, _XER(r1) std r6, SOFTE(r1) - ld r2, PACATOC(r13) + LOAD_PACA_TOC() LOAD_REG_IMMEDIATE(3, 0x7265677368657265) std r3, STACK_FRAME_OVERHEAD-16(r1) diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S index 3bf17c854be4..2158f61e317f 100644 --- a/arch/powerpc/kvm/tm.S +++ b/arch/powerpc/kvm/tm.S @@ -110,7 +110,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) mtmsrd r2, 1 /* Reload TOC pointer. */ - ld r2, PACATOC(r13) + LOAD_PACA_TOC() /* Save all but r0-r2, r9 & r13 */ reg = 3 diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index be26f33a6ac0..b3b3dfeec8f5 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -1118,7 +1118,7 @@ tlb_load_linear: * we only use 1G pages for now. That might have to be changed in a * final implementation, especially when dealing with hypervisors */ - ld r11,PACATOC(r13) + __LOAD_PACA_TOC(r11) ld r11,linear_map_top@got(r11) ld r10,0(r11) tovirt(10,10) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index e5acc33b3b20..0ed95f753416 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -57,7 +57,7 @@ opal_return: .long 0xa64b7b7d /* mthsrr1 r11 */ .long 0x2402004c /* hrfid */ #endif - ld r2,PACATOC(r13) + LOAD_PACA_TOC() ld r0,PPC_LR_STKOFF(r1) mtlr r0 blr -- cgit v1.2.3 From 3569d84bb26f6f07d426446da3d2c836180f1565 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 13:40:57 +1000 Subject: powerpc/64e: provide an addressing macro for use with TOC in alternate register The interrupt entry code carefully saves a minimal number of registers, so in some places the TOC is required, it is loaded into a different register, so provide a macro that can supply an alternate TOC register. This continues to use got addressing because TOC-relative results in "got/toc optimization is not supported" messages by the linker. Having r2 be one of the saved registers and using that for TOC addressing may be the best way to avoid that and switch this to TOC addressing. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926034057.2360083-6-npiggin@gmail.com --- arch/powerpc/include/asm/ppc_asm.h | 11 +++++++++++ arch/powerpc/kernel/exceptions-64e.S | 14 +++++++------- arch/powerpc/mm/nohash/tlb_low_64e.S | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index cf6bec9770d6..753a2757bcd4 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -346,6 +346,17 @@ n: addis reg,r2,name@toc@ha; \ addi reg,reg,name@toc@l +#ifdef CONFIG_PPC_BOOK3E_64 +/* + * This is used in register-constrained interrupt handlers. Not to be used + * by BOOK3S. ld complains with "got/toc optimization is not supported" if r2 + * is not used for the TOC offset, so use @got(tocreg). If the interrupt + * handlers saved r2 instead, LOAD_REG_ADDR could be used. + */ +#define LOAD_REG_ADDR_ALTTOC(reg,tocreg,name) \ + ld reg,name@got(tocreg) +#endif + #define LOAD_REG_ADDRBASE(reg,name) LOAD_REG_ADDR(reg,name) #define ADDROFF(name) 0 diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index ece1bd2a4a39..930e36099015 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -688,8 +688,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #ifdef CONFIG_RELOCATABLE __LOAD_PACA_TOC(r15) - ld r14,interrupt_base_book3e@got(r15) - ld r15,__end_interrupts@got(r15) + LOAD_REG_ADDR_ALTTOC(r14, r15, interrupt_base_book3e) + LOAD_REG_ADDR_ALTTOC(r15, r15, __end_interrupts) cmpld cr0,r10,r14 cmpld cr1,r10,r15 #else @@ -759,8 +759,8 @@ kernel_dbg_exc: #ifdef CONFIG_RELOCATABLE __LOAD_PACA_TOC(r15) - ld r14,interrupt_base_book3e@got(r15) - ld r15,__end_interrupts@got(r15) + LOAD_REG_ADDR_ALTTOC(r14, r15, interrupt_base_book3e) + LOAD_REG_ADDR_ALTTOC(r15, r15, __end_interrupts) cmpld cr0,r10,r14 cmpld cr1,r10,r15 #else @@ -884,8 +884,8 @@ kernel_dbg_exc: .macro SEARCH_RESTART_TABLE #ifdef CONFIG_RELOCATABLE __LOAD_PACA_TOC(r11) - ld r14,__start___restart_table@got(r11) - ld r15,__stop___restart_table@got(r11) + LOAD_REG_ADDR_ALTTOC(r14, r11, __start___restart_table) + LOAD_REG_ADDR_ALTTOC(r15, r11, __stop___restart_table) #else LOAD_REG_IMMEDIATE_SYM(r14, r11, __start___restart_table) LOAD_REG_IMMEDIATE_SYM(r15, r11, __stop___restart_table) @@ -1303,7 +1303,7 @@ a2_tlbinit_after_linear_map: /* Now we branch the new virtual address mapped by this entry */ #ifdef CONFIG_RELOCATABLE __LOAD_PACA_TOC(r5) - ld r3,1f@got(r5) + LOAD_REG_ADDR_ALTTOC(r3, r5, 1f) #else LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f) #endif diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index b3b3dfeec8f5..76cf456d7976 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -1119,7 +1119,7 @@ tlb_load_linear: * final implementation, especially when dealing with hypervisors */ __LOAD_PACA_TOC(r11) - ld r11,linear_map_top@got(r11) + LOAD_REG_ADDR_ALTTOC(r11, r11, linear_map_top) ld r10,0(r11) tovirt(10,10) cmpld cr0,r16,r10 -- cgit v1.2.3 From 2f5182cffa43f31c241131a2c10a4ecd8e90fb3e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Sep 2022 15:56:17 +1000 Subject: powerpc/64s: early boot machine check handler Use the early boot interrupt fixup in the machine check handler to allow the machine check handler to run before interrupt endian is set up. Branch to an early boot handler that just does a basic crash, which allows it to run before ppc_md is set up. MSR[ME] is enabled on the boot CPU earlier, and the machine check stack is temporarily set to the middle of the init task stack. This allows machine checks (e.g., due to invalid data access in real mode) to print something useful earlier in boot (as soon as udbg is set up, if CONFIG_PPC_EARLY_DEBUG=y). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926055620.2676869-3-npiggin@gmail.com --- arch/powerpc/include/asm/asm-prototypes.h | 1 + arch/powerpc/kernel/exceptions-64s.S | 6 +++++- arch/powerpc/kernel/setup_64.c | 14 ++++++++++++++ arch/powerpc/kernel/traps.c | 14 ++++++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index e1b3e90eec94..274bce76f5da 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -36,6 +36,7 @@ int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, int64_t opcode, uint64_t msr); /* misc runtime */ +void enable_machine_check(void); extern u64 __bswapdi2(u64); extern s64 __lshrdi3(s64, int); extern s64 __ashldi3(s64, int); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index c1ce7deeeb6e..8d658c740db8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1134,6 +1134,7 @@ INT_DEFINE_BEGIN(machine_check) INT_DEFINE_END(machine_check) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) + EARLY_BOOT_FIXUP GEN_INT_ENTRY machine_check_early, virt=0 EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) @@ -1198,6 +1199,9 @@ BEGIN_FTR_SECTION bl enable_machine_check END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_FTR_SECTION + bl machine_check_early_boot +END_FTR_SECTION(0, 1) // nop out after boot bl machine_check_early std r3,RESULT(r1) /* Save result */ ld r12,_MSR(r1) @@ -3096,7 +3100,7 @@ CLOSE_FIXED_SECTION(virt_trampolines); USE_TEXT_SECTION() /* MSR[RI] should be clear because this uses SRR[01] */ -enable_machine_check: +_GLOBAL(enable_machine_check) mflr r0 bcl 20,31,$+4 0: mflr r3 diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e1ac803c9046..f50476da2f2c 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -180,6 +181,16 @@ static void __init fixup_boot_paca(void) { /* The boot cpu is started */ get_paca()->cpu_start = 1; +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Give the early boot machine check stack somewhere to use, use + * half of the init stack. This is a bit hacky but there should not be + * deep stack usage in early init so shouldn't overflow it or overwrite + * things. + */ + get_paca()->mc_emergency_sp = (void *)&init_thread_union + + (THREAD_SIZE/2); +#endif /* Allow percpu accesses to work until we setup percpu data */ get_paca()->data_offset = 0; /* Mark interrupts soft and hard disabled in PACA */ @@ -357,6 +368,9 @@ void __init early_setup(unsigned long dt_ptr) /* -------- printk is now safe to use ------- */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && (mfmsr() & MSR_HV)) + enable_machine_check(); + /* Try new device tree based feature discovery ... */ if (!dt_cpu_ftrs_init(__va(dt_ptr))) /* Otherwise use the old style CPU table */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 62ec50a7a8ef..9bdd79aa51cf 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -68,6 +68,7 @@ #include #include #include +#include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) int (*__debugger)(struct pt_regs *regs) __read_mostly; @@ -850,6 +851,19 @@ bail: } #ifdef CONFIG_PPC_BOOK3S_64 +DEFINE_INTERRUPT_HANDLER_RAW(machine_check_early_boot) +{ + udbg_printf("Machine check (early boot)\n"); + udbg_printf("SRR0=0x%016lx SRR1=0x%016lx\n", regs->nip, regs->msr); + udbg_printf(" DAR=0x%016lx DSISR=0x%08lx\n", regs->dar, regs->dsisr); + udbg_printf(" LR=0x%016lx R1=0x%08lx\n", regs->link, regs->gpr[1]); + udbg_printf("------\n"); + die("Machine check (early boot)", regs, SIGBUS); + for (;;) + ; + return 0; +} + DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async) { __machine_check_exception(regs); -- cgit v1.2.3 From 6bd7ff497b4af13ea3d53781ffca7dc744dbb4da Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Tue, 23 Aug 2022 01:17:51 +0200 Subject: powerpc/udbg: Remove extern function prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'extern' keyword is pointless and deprecated for function prototypes. Signed-off-by: Pali Rohár Suggested-by: Gabriel Paubert Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220822231751.16973-1-pali@kernel.org --- arch/powerpc/include/asm/udbg.h | 54 ++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 88add5593e6f..b1f094728b35 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -15,13 +15,13 @@ extern void (*udbg_flush)(void); extern int (*udbg_getc)(void); extern int (*udbg_getc_poll)(void); -extern void udbg_puts(const char *s); -extern int udbg_write(const char *s, int n); +void udbg_puts(const char *s); +int udbg_write(const char *s, int n); -extern void register_early_udbg_console(void); -extern void udbg_printf(const char *fmt, ...) +void register_early_udbg_console(void); +void udbg_printf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); -extern void udbg_progress(char *s, unsigned short hex); +void udbg_progress(char *s, unsigned short hex); void __init udbg_uart_init_mmio(void __iomem *addr, unsigned int stride); void __init udbg_uart_init_pio(unsigned long port, unsigned int stride); @@ -31,28 +31,28 @@ unsigned int __init udbg_probe_uart_speed(unsigned int clock); struct device_node; void __init udbg_scc_init(int force_scc); -extern int udbg_adb_init(int force_btext); -extern void udbg_adb_init_early(void); - -extern void __init udbg_early_init(void); -extern void __init udbg_init_debug_lpar(void); -extern void __init udbg_init_debug_lpar_hvsi(void); -extern void __init udbg_init_pmac_realmode(void); -extern void __init udbg_init_maple_realmode(void); -extern void __init udbg_init_pas_realmode(void); -extern void __init udbg_init_rtas_panel(void); -extern void __init udbg_init_rtas_console(void); -extern void __init udbg_init_btext(void); -extern void __init udbg_init_44x_as1(void); -extern void __init udbg_init_40x_realmode(void); -extern void __init udbg_init_cpm(void); -extern void __init udbg_init_usbgecko(void); -extern void __init udbg_init_memcons(void); -extern void __init udbg_init_ehv_bc(void); -extern void __init udbg_init_ps3gelic(void); -extern void __init udbg_init_debug_opal_raw(void); -extern void __init udbg_init_debug_opal_hvsi(void); -extern void __init udbg_init_debug_16550(void); +int udbg_adb_init(int force_btext); +void udbg_adb_init_early(void); + +void __init udbg_early_init(void); +void __init udbg_init_debug_lpar(void); +void __init udbg_init_debug_lpar_hvsi(void); +void __init udbg_init_pmac_realmode(void); +void __init udbg_init_maple_realmode(void); +void __init udbg_init_pas_realmode(void); +void __init udbg_init_rtas_panel(void); +void __init udbg_init_rtas_console(void); +void __init udbg_init_btext(void); +void __init udbg_init_44x_as1(void); +void __init udbg_init_40x_realmode(void); +void __init udbg_init_cpm(void); +void __init udbg_init_usbgecko(void); +void __init udbg_init_memcons(void); +void __init udbg_init_ehv_bc(void); +void __init udbg_init_ps3gelic(void); +void __init udbg_init_debug_opal_raw(void); +void __init udbg_init_debug_opal_hvsi(void); +void __init udbg_init_debug_16550(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_UDBG_H */ -- cgit v1.2.3 From bbd71709087a9d486d1da42399eec14e106072f2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 28 Sep 2022 01:04:18 +1000 Subject: powerpc: Make stack frame marker upper case Now that the stack frame regs marker is only 32-bits it is not as obvious in memory dumps and easier to miss, eg: c000000004733e40 0000000000000000 0000000000000000 |................| c000000004733e50 0000000000000000 0000000000000000 |................| c000000004733e60 0000000000000000 0000000000000000 |................| c000000004733e70 7367657200000000 0000000000000000 |sger............| c000000004733e80 a700000000000000 708897f7ff7f0000 |........p.......| c000000004733e90 0073428fff7f0000 208997f7ff7f0000 |.sB..... .......| c000000004733ea0 0100000000000000 ffffffffffffffff |................| c000000004733eb0 0000000000000000 0000000000000000 |................| So make it upper case to make it stand out a bit more: c000000004733e70 5347455200000000 0000000000000000 |SGER............| Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220927150419.1503001-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 5b496e589d54..6c23d1d25dc7 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -99,7 +99,7 @@ struct pt_regs #define STACK_FRAME_WITH_PT_REGS (STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)) -#define STACK_FRAME_REGS_MARKER ASM_CONST(0x72656773) +#define STACK_FRAME_REGS_MARKER ASM_CONST(0x52454753) #ifdef __powerpc64__ -- cgit v1.2.3 From 19c95df1277c48e3ef8cc7d9f1d315dce949f203 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 28 Sep 2022 01:04:19 +1000 Subject: powerpc: Reverse stack frame marker on little endian On little endian the stack frame marker appears reversed when dumping memory sequentially, as is typical in xmon or gdb, eg: c000000004733e40 0000000000000000 0000000000000000 |................| c000000004733e50 0000000000000000 0000000000000000 |................| c000000004733e60 0000000000000000 0000000000000000 |................| c000000004733e70 5347455200000000 0000000000000000 |SGER............| c000000004733e80 a700000000000000 708897f7ff7f0000 |........p.......| c000000004733e90 0073428fff7f0000 208997f7ff7f0000 |.sB..... .......| c000000004733ea0 0100000000000000 ffffffffffffffff |................| c000000004733eb0 0000000000000000 0000000000000000 |................| To make it easier to recognise, reverse the value on little endian, so it always appears as "REGS", eg: c000000004733e70 5245475300000000 0000000000000000 |REGS............| Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220927150419.1503001-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/ptrace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 6c23d1d25dc7..2efec6d87049 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -99,7 +99,12 @@ struct pt_regs #define STACK_FRAME_WITH_PT_REGS (STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)) +// Always displays as "REGS" in memory dumps +#ifdef CONFIG_CPU_BIG_ENDIAN #define STACK_FRAME_REGS_MARKER ASM_CONST(0x52454753) +#else +#define STACK_FRAME_REGS_MARKER ASM_CONST(0x53474552) +#endif #ifdef __powerpc64__ -- cgit v1.2.3 From 335e1a91042764629fbbcd8c7e40379fa3762d35 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 27 Sep 2022 18:29:27 -0700 Subject: powerpc: Ignore DSI error caused by the copy/paste instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The data storage interrupt (DSI) error will be generated when the paste operation is issued on the suspended Nest Accelerator (NX) window due to NX state changes. The hypervisor expects the partition to ignore this error during page fault handling. To differentiate DSI caused by an actual HW configuration or by the NX window, a new “ibm,pi-features” type value is defined. Byte 0, bit 3 of pi-attribute-specifier-type is now defined to indicate this DSI error. If this error is not ignored, the user space can get SIGBUS when the NX request is issued. This patch adds changes to read ibm,pi-features property and ignore DSI error during page fault handling if MMU_FTR_NX_DSI is defined. Signed-off-by: Haren Myneni [mpe: Mention PAPR version in comment] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b9cd844b85eb8f70459109ce1b14e44c4cc85fa7.camel@linux.ibm.com --- arch/powerpc/include/asm/mmu.h | 5 ++++- arch/powerpc/kernel/prom.c | 36 ++++++++++++++++++++++++++---------- arch/powerpc/mm/fault.c | 17 ++++++++++++++++- 3 files changed, 46 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 39057320e436..94b981152667 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -120,6 +120,9 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) +// NX paste RMA reject in DSI +#define MMU_FTR_NX_DSI ASM_CONST(0x80000000) + /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 (MMU_FTR_HPTE_TABLE | MMU_FTR_TLBIEL | MMU_FTR_16M_PAGE) #define MMU_FTRS_POWER MMU_FTRS_DEFAULT_HPTE_ARCH_V2 @@ -181,7 +184,7 @@ enum { #endif #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | - MMU_FTR_GTSE | + MMU_FTR_GTSE | MMU_FTR_NX_DSI | #endif /* CONFIG_PPC_RADIX_MMU */ #endif #ifdef CONFIG_PPC_KUAP diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index a730b951b64b..2e7a04dab2f7 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -137,7 +137,7 @@ static void __init move_device_tree(void) } /* - * ibm,pa-features is a per-cpu property that contains a string of + * ibm,pa/pi-features is a per-cpu property that contains a string of * attribute descriptors, each of which has a 2 byte header plus up * to 254 bytes worth of processor attribute bits. First header * byte specifies the number of bytes following the header. @@ -149,15 +149,17 @@ static void __init move_device_tree(void) * is supported/not supported. Note that the bit numbers are * big-endian to match the definition in PAPR. */ -static struct ibm_pa_feature { +struct ibm_feature { unsigned long cpu_features; /* CPU_FTR_xxx bit */ unsigned long mmu_features; /* MMU_FTR_xxx bit */ unsigned int cpu_user_ftrs; /* PPC_FEATURE_xxx bit */ unsigned int cpu_user_ftrs2; /* PPC_FEATURE2_xxx bit */ - unsigned char pabyte; /* byte number in ibm,pa-features */ + unsigned char pabyte; /* byte number in ibm,pa/pi-features */ unsigned char pabit; /* bit number (big-endian) */ unsigned char invert; /* if 1, pa bit set => clear feature */ -} ibm_pa_features[] __initdata = { +}; + +static struct ibm_feature ibm_pa_features[] __initdata = { { .pabyte = 0, .pabit = 0, .cpu_user_ftrs = PPC_FEATURE_HAS_MMU }, { .pabyte = 0, .pabit = 1, .cpu_user_ftrs = PPC_FEATURE_HAS_FPU }, { .pabyte = 0, .pabit = 3, .cpu_features = CPU_FTR_CTRL }, @@ -179,9 +181,19 @@ static struct ibm_pa_feature { { .pabyte = 64, .pabit = 0, .cpu_features = CPU_FTR_DAWR1 }, }; +/* + * ibm,pi-features property provides the support of processor specific + * options not described in ibm,pa-features. Right now use byte 0, bit 3 + * which indicates the occurrence of DSI interrupt when the paste operation + * on the suspended NX window. + */ +static struct ibm_feature ibm_pi_features[] __initdata = { + { .pabyte = 0, .pabit = 3, .mmu_features = MMU_FTR_NX_DSI }, +}; + static void __init scan_features(unsigned long node, const unsigned char *ftrs, unsigned long tablelen, - struct ibm_pa_feature *fp, + struct ibm_feature *fp, unsigned long ft_size) { unsigned long i, len, bit; @@ -218,17 +230,18 @@ static void __init scan_features(unsigned long node, const unsigned char *ftrs, } } -static void __init check_cpu_pa_features(unsigned long node) +static void __init check_cpu_features(unsigned long node, char *name, + struct ibm_feature *fp, + unsigned long size) { const unsigned char *pa_ftrs; int tablelen; - pa_ftrs = of_get_flat_dt_prop(node, "ibm,pa-features", &tablelen); + pa_ftrs = of_get_flat_dt_prop(node, name, &tablelen); if (pa_ftrs == NULL) return; - scan_features(node, pa_ftrs, tablelen, - ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); + scan_features(node, pa_ftrs, tablelen, fp, size); } #ifdef CONFIG_PPC_64S_HASH_MMU @@ -380,7 +393,10 @@ static int __init early_init_dt_scan_cpus(unsigned long node, identify_cpu(0, be32_to_cpup(prop)); check_cpu_feature_properties(node); - check_cpu_pa_features(node); + check_cpu_features(node, "ibm,pa-features", ibm_pa_features, + ARRAY_SIZE(ibm_pa_features)); + check_cpu_features(node, "ibm,pi-features", ibm_pi_features, + ARRAY_SIZE(ibm_pi_features)); } identical_pvr_fixup(node); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 1566804e4b3d..2bef19cc1b98 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -371,7 +371,22 @@ static void sanity_check_fault(bool is_write, bool is_user, #elif defined(CONFIG_PPC_8xx) #define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) #elif defined(CONFIG_PPC64) -#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) +static int page_fault_is_bad(unsigned long err) +{ + unsigned long flag = DSISR_BAD_FAULT_64S; + + /* + * PAPR+ v2.11 § 14.15.3.4.1 (unreleased) + * If byte 0, bit 3 of pi-attribute-specifier-type in + * ibm,pi-features property is defined, ignore the DSI error + * which is caused by the paste instruction on the + * suspended NX window. + */ + if (mmu_has_feature(MMU_FTR_NX_DSI)) + flag &= ~DSISR_BAD_COPYPASTE; + + return err & flag; +} #else #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) #endif -- cgit v1.2.3 From d368e0c478a628f36680650f8d1d1634037b046e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 7 Sep 2022 13:49:41 +0530 Subject: powerpc/mm/book3s/hash: Rename flush_tlb_pmd_range This function does the hash page table update. Hence rename it to indicate this better to avoid confusion with flush_pmd_tlb_range() Signed-off-by: Aneesh Kumar K.V [mpe: Drop unnecessary extern] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220907081941.209501-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 4 +--- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 +- arch/powerpc/mm/book3s64/hash_tlb.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 8b762f282190..fab8332fe1ad 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -112,13 +112,11 @@ static inline void hash__flush_tlb_kernel_range(unsigned long start, struct mmu_gather; extern void hash__tlb_flush(struct mmu_gather *tlb); -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr); #ifdef CONFIG_PPC_64S_HASH_MMU /* Private function for use by PCI IO mapping code */ extern void __flush_hash_table_range(unsigned long start, unsigned long end); -extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr); +void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr); #else static inline void __flush_hash_table_range(unsigned long start, unsigned long end) { } #endif diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 28332001bd87..747492edb75a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -256,7 +256,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * the __collapse_huge_page_copy can result in copying * the old content. */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); return pmd; } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index eb0bccaf221e..a64ea0a7ef96 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -221,7 +221,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) local_irq_restore(flags); } -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) { pte_t *pte; pte_t *start_pte; -- cgit v1.2.3 From 41dc056391b334fae646b55ee020bfa8f67b60c8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 30 Sep 2022 18:27:04 +1000 Subject: powerpc: Add hardware description string Create a hardware description string, which we will use to record various details of the hardware platform we are running on. Print the accumulated description at boot, and use it to set the generic description which is printed in oopses. To begin with add ppc_md.name, aka the "machine description". Example output at boot with the full series applied: Linux version 6.0.0-rc2-gcc-11.1.0-00199-g893f9007a5ce-dirty (michael@alpine1-p1) (powerpc64-linux-gcc (GCC) 11.1.0, GNU ld (GNU Binutils) 2.36.1) #844 SMP Thu Sep 29 22:29:53 AEST 2022 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1200 0xf000005 of:SLOF,git-5b4c5a pSeries printk: bootconsole [udbg0] enabled Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220930082709.55830-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/setup.h | 2 ++ arch/powerpc/kernel/setup-common.c | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 85143849a586..e29e83f8a89c 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -88,6 +88,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long pp, unsigned long r6, unsigned long r7, unsigned long kbase); +extern struct seq_buf ppc_hw_desc; + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index dd98f43bd685..6d041993a45d 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -588,6 +590,15 @@ static __init int add_pcspkr(void) device_initcall(add_pcspkr); #endif /* CONFIG_PCSPKR_PLATFORM */ +static char ppc_hw_desc_buf[128] __initdata; + +struct seq_buf ppc_hw_desc __initdata = { + .buffer = ppc_hw_desc_buf, + .size = sizeof(ppc_hw_desc_buf), + .len = 0, + .readpos = 0, +}; + static __init void probe_machine(void) { extern struct machdep_calls __machine_desc_start; @@ -628,7 +639,13 @@ static __init void probe_machine(void) for (;;); } - printk(KERN_INFO "Using %s machine description\n", ppc_md.name); + // Append the machine name to other info we've gathered + seq_buf_puts(&ppc_hw_desc, ppc_md.name); + + // Set the generic hardware description shown in oopses + dump_stack_set_arch_desc(ppc_hw_desc.buffer); + + pr_info("Hardware name: %s\n", ppc_hw_desc.buffer); } /* Match a class of boards, not a specific device configuration. */ -- cgit v1.2.3 From 0fa6831811f62cfc10415d731bcf9fde2647ad81 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 4 Oct 2022 15:11:57 +1000 Subject: powerpc/64: Fix msr_check_and_set/clear MSR[EE] race irq soft-masking means that when Linux irqs are disabled, the MSR[EE] value can change from 1 to 0 asynchronously: if a masked interrupt of the PACA_IRQ_MUST_HARD_MASK variety fires while irqs are disabled, the masked handler will return with MSR[EE]=0. This means a sequence like mtmsr(mfmsr() | MSR_FP) is racy if it can be called with local irqs disabled, unless a hard_irq_disable has been done. Reported-by: Sachin Sant Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221004051157.308999-2-npiggin@gmail.com --- arch/powerpc/include/asm/hw_irq.h | 24 ++++++++++++++++++++++++ arch/powerpc/kernel/process.c | 4 ++-- 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index e8de249339d8..77fa88c2aed0 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -471,6 +471,30 @@ static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs, unsigned l } #endif /* CONFIG_PPC64 */ +static inline unsigned long mtmsr_isync_irqsafe(unsigned long msr) +{ +#ifdef CONFIG_PPC64 + if (arch_irqs_disabled()) { + /* + * With soft-masking, MSR[EE] can change from 1 to 0 + * asynchronously when irqs are disabled, and we don't want to + * set MSR[EE] back to 1 here if that has happened. A race-free + * way to do this is ensure EE is already 0. Another way it + * could be done is with a RESTART_TABLE handler, but that's + * probably overkill here. + */ + msr &= ~MSR_EE; + mtmsr_isync(msr); + irq_soft_mask_set(IRQS_ALL_DISABLED); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + } else +#endif + mtmsr_isync(msr); + + return msr; +} + + #define ARCH_IRQ_INIT_FLAGS IRQ_NOREQUEST #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0fbda89cd1bb..37df0428e4fb 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -127,7 +127,7 @@ unsigned long notrace msr_check_and_set(unsigned long bits) newmsr |= MSR_VSX; if (oldmsr != newmsr) - mtmsr_isync(newmsr); + newmsr = mtmsr_isync_irqsafe(newmsr); return newmsr; } @@ -145,7 +145,7 @@ void notrace __msr_check_and_clear(unsigned long bits) newmsr &= ~MSR_VSX; if (oldmsr != newmsr) - mtmsr_isync(newmsr); + mtmsr_isync_irqsafe(newmsr); } EXPORT_SYMBOL(__msr_check_and_clear); -- cgit v1.2.3 From 94746890202cf18e5266b4de77895243e55b0a79 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Oct 2022 23:34:17 +1100 Subject: powerpc: Don't add __powerpc_ prefix to syscall entry points When using syscall wrappers the __SYSCALL_DEFINEx() and related macros add a "__powerpc_" prefix to all syscall entry points. So for example sys_mmap becomes __powerpc_sys_mmap. This risks breaking workflows and tools that expect the old naming scheme. At a minimum setting a breakpoint on eg. sys_mmap with gdb no longer works. There seems to be no compelling reason to add the "__powerpc_" prefix, other than that it follows what some other arches do (x86, arm64, s390). But unlike other arches powerpc doesn't always enable syscall wrappers, so the syscall entry points can change name depending on CONFIG options. For those reasons drop the "__powerpc_" prefix, reverting to the existing naming. Doing so reveals two prototypes in signal.h that have the incorrect type when syscall wrappers are enabled. There are already prototypes for both functions in syscalls.h, so drop the ones from signal.h. Fixes: 7e92e01b7245 ("powerpc: Provide syscall wrapper") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221006135940.1223988-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/syscall_wrapper.h | 18 ++++++++---------- arch/powerpc/include/asm/syscalls.h | 2 +- arch/powerpc/kernel/signal.h | 3 --- arch/powerpc/kernel/systbl.c | 3 +-- 4 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscall_wrapper.h b/arch/powerpc/include/asm/syscall_wrapper.h index 75b41b173f7a..67486c67e8a2 100644 --- a/arch/powerpc/include/asm/syscall_wrapper.h +++ b/arch/powerpc/include/asm/syscall_wrapper.h @@ -16,11 +16,11 @@ struct pt_regs; ,,regs->gpr[6],,regs->gpr[7],,regs->gpr[8]) #define __SYSCALL_DEFINEx(x, name, ...) \ - long __powerpc_sys##name(const struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__powerpc_sys##name, ERRNO); \ + long sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(sys##name, ERRNO); \ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - long __powerpc_sys##name(const struct pt_regs *regs) \ + long sys##name(const struct pt_regs *regs) \ { \ return __se_sys##name(SC_POWERPC_REGS_TO_ARGS(x,__VA_ARGS__)); \ } \ @@ -35,17 +35,15 @@ struct pt_regs; #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - long __powerpc_sys_##sname(const struct pt_regs *__unused); \ - ALLOW_ERROR_INJECTION(__powerpc_sys_##sname, ERRNO); \ - long __powerpc_sys_##sname(const struct pt_regs *__unused) + long sys_##sname(const struct pt_regs *__unused); \ + ALLOW_ERROR_INJECTION(sys_##sname, ERRNO); \ + long sys_##sname(const struct pt_regs *__unused) #define COND_SYSCALL(name) \ - long __powerpc_sys_##name(const struct pt_regs *regs); \ - long __weak __powerpc_sys_##name(const struct pt_regs *regs) \ + long sys_##name(const struct pt_regs *regs); \ + long __weak sys_##name(const struct pt_regs *regs) \ { \ return sys_ni_syscall(); \ } -#define SYS_NI(name) SYSCALL_ALIAS(__powerpc_sys_##name, sys_ni_posix_timers); - #endif // __ASM_POWERPC_SYSCALL_WRAPPER_H diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 49bbc3e0733d..9840d572da55 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -124,7 +124,7 @@ long sys_ppc_fadvise64_64(int fd, int advice, #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) #define __SYSCALL(nr, entry) \ - long __powerpc_##entry(const struct pt_regs *regs); + long entry(const struct pt_regs *regs); #ifdef CONFIG_PPC64 #include diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 618aeccdf691..a429c57ed433 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -196,9 +196,6 @@ extern int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, #else /* CONFIG_PPC64 */ -extern long sys_rt_sigreturn(void); -extern long sys_sigreturn(void); - static inline int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, struct task_struct *tsk) { diff --git a/arch/powerpc/kernel/systbl.c b/arch/powerpc/kernel/systbl.c index 9d7c5a596171..4305f2a2162f 100644 --- a/arch/powerpc/kernel/systbl.c +++ b/arch/powerpc/kernel/systbl.c @@ -20,8 +20,7 @@ #undef __SYSCALL #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER -#define __SYSCALL(nr, entry) [nr] = __powerpc_##entry, -#define __powerpc_sys_ni_syscall sys_ni_syscall +#define __SYSCALL(nr, entry) [nr] = entry, #else /* * Coerce syscall handlers with arbitrary parameters to common type -- cgit v1.2.3 From e237506238352f3bfa9cf3983cdab873e35651eb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Oct 2022 13:53:34 +1000 Subject: powerpc/32: fix syscall wrappers with 64-bit arguments of unaligned register-pairs powerpc 32-bit system call (and function) calling convention for 64-bit arguments requires the next available odd-pair (two sequential registers with the first being odd-numbered) from the standard register argument allocation. The first argument register is r3, so a 64-bit argument that appears at an even position in the argument list must skip a register (unless there were preceding 64-bit arguments, which might throw things off). This requires non-standard compat definitions to deal with the holes in the argument register allocation. With pt_regs syscall wrappers which use a standard mapper to map pt_regs GPRs to function arguments, 32-bit kernels hit the same basic problem, the standard definitions don't cope with the unused argument registers. Fix this by having 32-bit kernels share those syscall definitions with compat. Thanks to Jason for spending a lot of time finding and bisecting this and developing a trivial reproducer. The perfect bug report. Reported-by: Jason A. Donenfeld Signed-off-by: Nicholas Piggin Fixes: 7e92e01b72452 ("powerpc: Provide syscall wrapper") Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221012035335.866440-1-npiggin@gmail.com --- arch/powerpc/include/asm/syscalls.h | 16 ++++++++++++++ arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/sys_ppc32.c | 38 ++++++++++++++++++++++++-------- arch/powerpc/kernel/syscalls/syscall.tbl | 16 +++++++++----- 4 files changed, 56 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 9840d572da55..a1142496cd58 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -89,6 +89,22 @@ long compat_sys_rt_sigreturn(void); * responsible for combining parameter pairs. */ +#ifdef CONFIG_PPC32 +long sys_ppc_pread64(unsigned int fd, + char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); +long sys_ppc_pwrite64(unsigned int fd, + const char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); +long sys_ppc_readahead(int fd, u32 r4, + u32 offset1, u32 offset2, u32 count); +long sys_ppc_truncate64(const char __user *path, u32 reg4, + unsigned long len1, unsigned long len2); +long sys_ppc_ftruncate64(unsigned int fd, u32 reg4, + unsigned long len1, unsigned long len2); +long sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, + size_t len, int advice); +#endif #ifdef CONFIG_COMPAT long compat_sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ee2d76cb3187..9b6146056e48 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -73,6 +73,7 @@ obj-y := cputable.o syscalls.o \ obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o irq_64.o\ paca.o nvram_64.o note.o +obj-$(CONFIG_PPC32) += sys_ppc32.o obj-$(CONFIG_COMPAT) += sys_ppc32.o signal_32.o obj-$(CONFIG_VDSO32) += vdso32_wrapper.o obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index dcc3c9fd4cfd..1ab4a4d95aba 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -1,13 +1,23 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * sys_ppc32.c: Conversion between 32bit and 64bit native syscalls. + * sys_ppc32.c: 32-bit system calls with complex calling conventions. * * Copyright (C) 2001 IBM * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) * - * These routines maintain argument size conversion between 32bit and 64bit - * environment. + * 32-bit system calls with 64-bit arguments pass those in register pairs. + * This must be specially dealt with on 64-bit kernels. The compat_arg_u64_dual + * in generic compat syscalls is not always usable because the register + * pairing is constrained depending on preceding arguments. + * + * An analogous problem exists on 32-bit kernels with ARCH_HAS_SYSCALL_WRAPPER, + * the defined system call functions take the pt_regs as an argument, and there + * is a mapping macro which maps registers to arguments + * (SC_POWERPC_REGS_TO_ARGS) which also does not deal with these 64-bit + * arguments. + * + * This file contains these system calls. */ #include @@ -47,7 +57,17 @@ #include #include -COMPAT_SYSCALL_DEFINE6(ppc_pread64, +#ifdef CONFIG_PPC32 +#define PPC32_SYSCALL_DEFINE4 SYSCALL_DEFINE4 +#define PPC32_SYSCALL_DEFINE5 SYSCALL_DEFINE5 +#define PPC32_SYSCALL_DEFINE6 SYSCALL_DEFINE6 +#else +#define PPC32_SYSCALL_DEFINE4 COMPAT_SYSCALL_DEFINE4 +#define PPC32_SYSCALL_DEFINE5 COMPAT_SYSCALL_DEFINE5 +#define PPC32_SYSCALL_DEFINE6 COMPAT_SYSCALL_DEFINE6 +#endif + +PPC32_SYSCALL_DEFINE6(ppc_pread64, unsigned int, fd, char __user *, ubuf, compat_size_t, count, u32, reg6, u32, pos1, u32, pos2) @@ -55,7 +75,7 @@ COMPAT_SYSCALL_DEFINE6(ppc_pread64, return ksys_pread64(fd, ubuf, count, merge_64(pos1, pos2)); } -COMPAT_SYSCALL_DEFINE6(ppc_pwrite64, +PPC32_SYSCALL_DEFINE6(ppc_pwrite64, unsigned int, fd, const char __user *, ubuf, compat_size_t, count, u32, reg6, u32, pos1, u32, pos2) @@ -63,28 +83,28 @@ COMPAT_SYSCALL_DEFINE6(ppc_pwrite64, return ksys_pwrite64(fd, ubuf, count, merge_64(pos1, pos2)); } -COMPAT_SYSCALL_DEFINE5(ppc_readahead, +PPC32_SYSCALL_DEFINE5(ppc_readahead, int, fd, u32, r4, u32, offset1, u32, offset2, u32, count) { return ksys_readahead(fd, merge_64(offset1, offset2), count); } -COMPAT_SYSCALL_DEFINE4(ppc_truncate64, +PPC32_SYSCALL_DEFINE4(ppc_truncate64, const char __user *, path, u32, reg4, unsigned long, len1, unsigned long, len2) { return ksys_truncate(path, merge_64(len1, len2)); } -COMPAT_SYSCALL_DEFINE4(ppc_ftruncate64, +PPC32_SYSCALL_DEFINE4(ppc_ftruncate64, unsigned int, fd, u32, reg4, unsigned long, len1, unsigned long, len2) { return ksys_ftruncate(fd, merge_64(len1, len2)); } -COMPAT_SYSCALL_DEFINE6(ppc32_fadvise64, +PPC32_SYSCALL_DEFINE6(ppc32_fadvise64, int, fd, u32, unused, u32, offset1, u32, offset2, size_t, len, int, advice) { diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 2bca64f96164..e9e0df4f9a61 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -228,8 +228,10 @@ 176 64 rt_sigtimedwait sys_rt_sigtimedwait 177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -179 common pread64 sys_pread64 compat_sys_ppc_pread64 -180 common pwrite64 sys_pwrite64 compat_sys_ppc_pwrite64 +179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64 +179 64 pread64 sys_pread64 +180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64 +180 64 pwrite64 sys_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -242,10 +244,11 @@ 188 common putpmsg sys_ni_syscall 189 nospu vfork sys_vfork 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit -191 common readahead sys_readahead compat_sys_ppc_readahead +191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead +191 64 readahead sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 -193 32 truncate64 sys_truncate64 compat_sys_ppc_truncate64 -194 32 ftruncate64 sys_ftruncate64 compat_sys_ppc_ftruncate64 +193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64 +194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64 195 32 stat64 sys_stat64 196 32 lstat64 sys_lstat64 197 32 fstat64 sys_fstat64 @@ -288,7 +291,8 @@ 230 common io_submit sys_io_submit compat_sys_io_submit 231 common io_cancel sys_io_cancel 232 nospu set_tid_address sys_set_tid_address -233 common fadvise64 sys_fadvise64 compat_sys_ppc32_fadvise64 +233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64 +233 64 fadvise64 sys_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie 236 common epoll_create sys_epoll_create -- cgit v1.2.3 From b9ef323ea1682f9837bf63ba10c5e3750f71a20a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:45 +1000 Subject: powerpc/64s: Disable preemption in hash lazy mmu mode apply_to_page_range on kernel pages does not disable preemption, which is a requirement for hash's lazy mmu mode, which keeps track of the TLBs to flush with a per-cpu array. Reported-by: Guenter Roeck Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-1-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index fab8332fe1ad..751921f6db46 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void) if (radix_enabled()) return; + /* + * apply_to_page_range can call us this preempt enabled when + * operating on kernel page tables. + */ + preempt_disable(); batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } @@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void) if (batch->index) __flush_tlb_pending(batch); batch->active = 0; + preempt_enable(); } #define arch_flush_lazy_mmu_mode() do {} while (0) -- cgit v1.2.3 From ce883a2ba310cd7c291bb66ce5d207965fca6003 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 31 Oct 2022 15:47:35 +0100 Subject: powerpc/32: fix syscall wrappers with 64-bit arguments With the introduction of syscall wrappers all wrappers for syscalls with 64-bit arguments must be handled specially, not only those that have unaligned 64-bit arguments. This left out the fallocate() and sync_file_range2() syscalls. Fixes: 7e92e01b7245 ("powerpc: Provide syscall wrapper") Fixes: e23750623835 ("powerpc/32: fix syscall wrappers with 64-bit arguments of unaligned register-pairs") Signed-off-by: Andreas Schwab Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/87mt9cxd6g.fsf_-_@igel.home --- arch/powerpc/include/asm/syscalls.h | 7 +++++++ arch/powerpc/kernel/sys_ppc32.c | 13 ++++++++++++- arch/powerpc/kernel/syscalls/syscall.tbl | 7 +++++-- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index a1142496cd58..6d51b007b59e 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -104,6 +104,13 @@ long sys_ppc_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, unsigned long len2); long sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, size_t len, int advice); +long sys_ppc_sync_file_range2(int fd, unsigned int flags, + unsigned int offset1, + unsigned int offset2, + unsigned int nbytes1, + unsigned int nbytes2); +long sys_ppc_fallocate(int fd, int mode, u32 offset1, u32 offset2, + u32 len1, u32 len2); #endif #ifdef CONFIG_COMPAT long compat_sys_mmap2(unsigned long addr, size_t len, diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 1ab4a4d95aba..d451a8229223 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -112,7 +112,7 @@ PPC32_SYSCALL_DEFINE6(ppc32_fadvise64, advice); } -COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2, +PPC32_SYSCALL_DEFINE6(ppc_sync_file_range2, int, fd, unsigned int, flags, unsigned int, offset1, unsigned int, offset2, unsigned int, nbytes1, unsigned int, nbytes2) @@ -122,3 +122,14 @@ COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2, return ksys_sync_file_range(fd, offset, nbytes, flags); } + +#ifdef CONFIG_PPC32 +SYSCALL_DEFINE6(ppc_fallocate, + int, fd, int, mode, + u32, offset1, u32, offset2, u32, len1, u32, len2) +{ + return ksys_fallocate(fd, mode, + merge_64(offset1, offset2), + merge_64(len1, len2)); +} +#endif diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index e9e0df4f9a61..a0be127475b1 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -394,8 +394,11 @@ 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd -308 common sync_file_range2 sys_sync_file_range2 compat_sys_ppc_sync_file_range2 -309 nospu fallocate sys_fallocate compat_sys_fallocate +308 32 sync_file_range2 sys_ppc_sync_file_range2 compat_sys_ppc_sync_file_range2 +308 64 sync_file_range2 sys_sync_file_range2 +308 spu sync_file_range2 sys_sync_file_range2 +309 32 fallocate sys_ppc_fallocate compat_sys_fallocate +309 64 fallocate sys_fallocate 310 nospu subpage_prot sys_subpage_prot 311 32 timerfd_settime sys_timerfd_settime32 311 64 timerfd_settime sys_timerfd_settime -- cgit v1.2.3 From 2e7ec190a0e38aaa8a6d87fd5f804ec07947febc Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 25 Nov 2022 22:34:42 +1100 Subject: powerpc/64s: Add missing declaration for machine_check_early_boot() There's no declaration for machine_check_early_boot(), which leads to a build failure with W=1. Add one. Fixes: 2f5182cffa43 ("powerpc/64s: early boot machine check handler") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221125132521.2167039-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/interrupt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 4745bb9998bd..6d8492b6e2b8 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -602,6 +602,7 @@ ____##func(struct pt_regs *regs) /* kernel/traps.c */ DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception); #ifdef CONFIG_PPC_BOOK3S_64 +DECLARE_INTERRUPT_HANDLER_RAW(machine_check_early_boot); DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async); #endif DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception); -- cgit v1.2.3