From c5f4546593e9911800f0926c1090959b58bc5c93 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:18 -0600 Subject: livepatch: kernel: add TAINT_LIVEPATCH This adds a new taint flag to indicate when the kernel or a kernel module has been live patched. This will provide a clean indication in bug reports that live patching was used. Additionally, if the crash occurs in a live patched function, the live patch module will appear beside the patched function in the backtrace. Signed-off-by: Seth Jennings Acked-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Kosina --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 4d8d6f906dec..8136ad76e5fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -226,6 +226,7 @@ static const struct tnt tnts[] = { { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, { TAINT_SOFTLOCKUP, 'L', ' ' }, + { TAINT_LIVEPATCH, 'K', ' ' }, }; /** @@ -246,6 +247,7 @@ static const struct tnt tnts[] = { * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. + * 'K' - Kernel has been live patched. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3 From b700e7f03df5d92f85fa5247fe1f557528d3363d Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:19 -0600 Subject: livepatch: kernel: add support for live patching This commit introduces code for the live patching core. It implements an ftrace-based mechanism and kernel interface for doing live patching of kernel and kernel module functions. It represents the greatest common functionality set between kpatch and kgraft and can accept patches built using either method. This first version does not implement any consistency mechanism that ensures that old and new code do not run together. In practice, ~90% of CVEs are safe to apply in this way, since they simply add a conditional check. However, any function change that can not execute safely with the old version of the function can _not_ be safely applied in this version. [ jkosina@suse.cz: due to the number of contributions that got folded into this original patch from Seth Jennings, add SUSE's copyright as well, as discussed via e-mail ] Signed-off-by: Seth Jennings Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Miroslav Benes Signed-off-by: Petr Mladek Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 44 ++ MAINTAINERS | 13 + arch/x86/Kconfig | 3 + arch/x86/include/asm/livepatch.h | 37 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/livepatch.c | 90 +++ include/linux/livepatch.h | 133 ++++ kernel/Makefile | 1 + kernel/livepatch/Kconfig | 18 + kernel/livepatch/Makefile | 3 + kernel/livepatch/core.c | 930 +++++++++++++++++++++++ 11 files changed, 1273 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-livepatch create mode 100644 arch/x86/include/asm/livepatch.h create mode 100644 arch/x86/kernel/livepatch.c create mode 100644 include/linux/livepatch.h create mode 100644 kernel/livepatch/Kconfig create mode 100644 kernel/livepatch/Makefile create mode 100644 kernel/livepatch/core.c (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch new file mode 100644 index 000000000000..5bf42a840b22 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -0,0 +1,44 @@ +What: /sys/kernel/livepatch +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + Interface for kernel live patching + + The /sys/kernel/livepatch directory contains subdirectories for + each loaded live patch module. + +What: /sys/kernel/livepatch/ +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The patch directory contains subdirectories for each kernel + object (vmlinux or a module) in which it patched functions. + +What: /sys/kernel/livepatch//enabled +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + A writable attribute that indicates whether the patched + code is currently applied. Writing 0 will disable the patch + while writing 1 will re-enable the patch. + +What: /sys/kernel/livepatch// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The object directory contains subdirectories for each function + that is patched within the object. + +What: /sys/kernel/livepatch/// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The function directory contains attributes regarding the + properties and state of the patched function. + + There are currently no such attributes. diff --git a/MAINTAINERS b/MAINTAINERS index ddb9ac8d32b3..df6a0784b466 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5784,6 +5784,19 @@ F: Documentation/misc-devices/lis3lv02d F: drivers/misc/lis3lv02d/ F: drivers/platform/x86/hp_accel.c +LIVE PATCHING +M: Josh Poimboeuf +M: Seth Jennings +M: Jiri Kosina +M: Vojtech Pavlik +S: Maintained +F: kernel/livepatch/ +F: include/linux/livepatch.h +F: arch/x86/include/asm/livepatch.h +F: arch/x86/kernel/livepatch.c +F: Documentation/ABI/testing/sysfs-kernel-livepatch +L: live-patching@vger.kernel.org + LLC (802.2) M: Arnaldo Carvalho de Melo S: Maintained diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..460b31b79938 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,6 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_HAVE_LIVE_PATCHING ### Arch settings config X86 @@ -2008,6 +2009,8 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +source "kernel/livepatch/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h new file mode 100644 index 000000000000..d529db1b1edf --- /dev/null +++ b/arch/x86/include/asm/livepatch.h @@ -0,0 +1,37 @@ +/* + * livepatch.h - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _ASM_X86_LIVEPATCH_H +#define _ASM_X86_LIVEPATCH_H + +#include + +#ifdef CONFIG_LIVE_PATCHING +#ifndef CC_USING_FENTRY +#error Your compiler must support -mfentry for live patching to work +#endif +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +#else +#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#endif + +#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c8b983..316b34e74c15 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 000000000000..ff3c3101d003 --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,90 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. + */ +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value) +{ + int ret, numpages, size = 4; + bool readonly; + unsigned long val; + unsigned long core = (unsigned long)mod->module_core; + unsigned long core_ro_size = mod->core_ro_size; + unsigned long core_size = mod->core_size; + + switch (type) { + case R_X86_64_NONE: + return 0; + case R_X86_64_64: + val = value; + size = 8; + break; + case R_X86_64_32: + val = (u32)value; + break; + case R_X86_64_32S: + val = (s32)value; + break; + case R_X86_64_PC32: + val = (u32)(value - loc); + break; + default: + /* unsupported relocation type */ + return -EINVAL; + } + + if (loc < core || loc >= core + core_size) + /* loc does not point to any symbol inside the module */ + return -EINVAL; + + if (loc < core + core_ro_size) + readonly = true; + else + readonly = false; + + /* determine if the relocation spans a page boundary */ + numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; + + if (readonly) + set_memory_rw(loc & PAGE_MASK, numpages); + + ret = probe_kernel_write((void *)loc, &val, size); + + if (readonly) + set_memory_ro(loc & PAGE_MASK, numpages); + + return ret; +} diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h new file mode 100644 index 000000000000..950bc615842f --- /dev/null +++ b/include/linux/livepatch.h @@ -0,0 +1,133 @@ +/* + * livepatch.h - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LINUX_LIVEPATCH_H_ +#define _LINUX_LIVEPATCH_H_ + +#include +#include + +#if IS_ENABLED(CONFIG_LIVE_PATCHING) + +#include + +enum klp_state { + KLP_DISABLED, + KLP_ENABLED +}; + +/** + * struct klp_func - function structure for live patching + * @old_name: name of the function to be patched + * @new_func: pointer to the patched function code + * @old_addr: a hint conveying at what address the old function + * can be found (optional, vmlinux patches only) + * @kobj: kobject for sysfs resources + * @fops: ftrace operations structure + * @state: tracks function-level patch application state + */ +struct klp_func { + /* external */ + const char *old_name; + void *new_func; + /* + * The old_addr field is optional and can be used to resolve + * duplicate symbol names in the vmlinux object. If this + * information is not present, the symbol is located by name + * with kallsyms. If the name is not unique and old_addr is + * not provided, the patch application fails as there is no + * way to resolve the ambiguity. + */ + unsigned long old_addr; + + /* internal */ + struct kobject kobj; + struct ftrace_ops *fops; + enum klp_state state; +}; + +/** + * struct klp_reloc - relocation structure for live patching + * @loc: address where the relocation will be written + * @val: address of the referenced symbol (optional, + * vmlinux patches only) + * @type: ELF relocation type + * @name: name of the referenced symbol (for lookup/verification) + * @addend: offset from the referenced symbol + * @external: symbol is either exported or within the live patch module itself + */ +struct klp_reloc { + unsigned long loc; + unsigned long val; + unsigned long type; + const char *name; + int addend; + int external; +}; + +/** + * struct klp_object - kernel object structure for live patching + * @name: module name (or NULL for vmlinux) + * @relocs: relocation entries to be applied at load time + * @funcs: function entries for functions to be patched in the object + * @kobj: kobject for sysfs resources + * @mod: kernel module associated with the patched object + * (NULL for vmlinux) + * @state: tracks object-level patch application state + */ +struct klp_object { + /* external */ + const char *name; + struct klp_reloc *relocs; + struct klp_func *funcs; + + /* internal */ + struct kobject *kobj; + struct module *mod; + enum klp_state state; +}; + +/** + * struct klp_patch - patch structure for live patching + * @mod: reference to the live patch module + * @objs: object entries for kernel objects to be patched + * @list: list node for global list of registered patches + * @kobj: kobject for sysfs resources + * @state: tracks patch-level application state + */ +struct klp_patch { + /* external */ + struct module *mod; + struct klp_object *objs; + + /* internal */ + struct list_head list; + struct kobject kobj; + enum klp_state state; +}; + +extern int klp_register_patch(struct klp_patch *); +extern int klp_unregister_patch(struct klp_patch *); +extern int klp_enable_patch(struct klp_patch *); +extern int klp_disable_patch(struct klp_patch *); + +#endif /* CONFIG_LIVE_PATCHING */ + +#endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..616994f0a76f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,6 +26,7 @@ obj-y += power/ obj-y += printk/ obj-y += irq/ obj-y += rcu/ +obj-y += livepatch/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig new file mode 100644 index 000000000000..96da00fbc120 --- /dev/null +++ b/kernel/livepatch/Kconfig @@ -0,0 +1,18 @@ +config ARCH_HAVE_LIVE_PATCHING + boolean + help + Arch supports kernel live patching + +config LIVE_PATCHING + boolean "Kernel Live Patching" + depends on DYNAMIC_FTRACE_WITH_REGS + depends on MODULES + depends on SYSFS + depends on KALLSYMS_ALL + depends on ARCH_HAVE_LIVE_PATCHING + help + Say Y here if you want to support kernel live patching. + This option has no runtime impact until a kernel "patch" + module uses the interface provided by this option to register + a patch, causing calls to patched functions to be redirected + to new function code contained in the patch module. diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile new file mode 100644 index 000000000000..7c1f00861428 --- /dev/null +++ b/kernel/livepatch/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o + +livepatch-objs := core.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c new file mode 100644 index 000000000000..f99fe189d596 --- /dev/null +++ b/kernel/livepatch/core.c @@ -0,0 +1,930 @@ +/* + * core.c - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The klp_mutex protects the klp_patches list and state transitions of any + * structure reachable from the patches list. References to any structure must + * be obtained under mutex protection. + */ + +static DEFINE_MUTEX(klp_mutex); +static LIST_HEAD(klp_patches); + +static struct kobject *klp_root_kobj; + +static bool klp_is_module(struct klp_object *obj) +{ + return obj->name; +} + +static bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +/* sets obj->mod if object is not vmlinux and module is found */ +static void klp_find_object_module(struct klp_object *obj) +{ + if (!klp_is_module(obj)) + return; + + mutex_lock(&module_mutex); + /* + * We don't need to take a reference on the module here because we have + * the klp_mutex, which is also taken by the module notifier. This + * prevents any module from unloading until we release the klp_mutex. + */ + obj->mod = find_module(obj->name); + mutex_unlock(&module_mutex); +} + +/* klp_mutex must be held by caller */ +static bool klp_is_patch_registered(struct klp_patch *patch) +{ + struct klp_patch *mypatch; + + list_for_each_entry(mypatch, &klp_patches, list) + if (mypatch == patch) + return true; + + return false; +} + +static bool klp_initialized(void) +{ + return klp_root_kobj; +} + +struct klp_find_arg { + const char *objname; + const char *name; + unsigned long addr; + /* + * If count == 0, the symbol was not found. If count == 1, a unique + * match was found and addr is set. If count > 1, there is + * unresolvable ambiguity among "count" number of symbols with the same + * name in the same object. + */ + unsigned long count; +}; + +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + /* + * args->addr might be overwritten if another match is found + * but klp_find_object_symbol() handles this and only returns the + * addr if count == 1. + */ + args->addr = addr; + args->count++; + + return 0; +} + +static int klp_find_object_symbol(const char *objname, const char *name, + unsigned long *addr) +{ + struct klp_find_arg args = { + .objname = objname, + .name = name, + .addr = 0, + .count = 0 + }; + + kallsyms_on_each_symbol(klp_find_callback, &args); + + if (args.count == 0) + pr_err("symbol '%s' not found in symbol table\n", name); + else if (args.count > 1) + pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", + args.count, name, objname); + else { + *addr = args.addr; + return 0; + } + + *addr = 0; + return -EINVAL; +} + +struct klp_verify_args { + const char *name; + const unsigned long addr; +}; + +static int klp_verify_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_verify_args *args = data; + + if (!mod && + !strcmp(args->name, name) && + args->addr == addr) + return 1; + + return 0; +} + +static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) +{ + struct klp_verify_args args = { + .name = name, + .addr = addr, + }; + + if (kallsyms_on_each_symbol(klp_verify_callback, &args)) + return 0; + + pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?", + name, addr); + return -EINVAL; +} + +static int klp_find_verify_func_addr(struct klp_object *obj, + struct klp_func *func) +{ + int ret; + +#if defined(CONFIG_RANDOMIZE_BASE) + /* KASLR is enabled, disregard old_addr from user */ + func->old_addr = 0; +#endif + + if (!func->old_addr || klp_is_module(obj)) + ret = klp_find_object_symbol(obj->name, func->old_name, + &func->old_addr); + else + ret = klp_verify_vmlinux_symbol(func->old_name, + func->old_addr); + + return ret; +} + +/* + * external symbols are located outside the parent object (where the parent + * object is either vmlinux or the kmod being patched). + */ +static int klp_find_external_symbol(struct module *pmod, const char *name, + unsigned long *addr) +{ + const struct kernel_symbol *sym; + + /* first, check if it's an exported symbol */ + preempt_disable(); + sym = find_symbol(name, NULL, NULL, true, true); + preempt_enable(); + if (sym) { + *addr = sym->value; + return 0; + } + + /* otherwise check if it's in another .o within the patch module */ + return klp_find_object_symbol(pmod->name, name, addr); +} + +static int klp_write_object_relocations(struct module *pmod, + struct klp_object *obj) +{ + int ret; + struct klp_reloc *reloc; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + if (WARN_ON(!obj->relocs)) + return -EINVAL; + + for (reloc = obj->relocs; reloc->name; reloc++) { + if (!klp_is_module(obj)) { + ret = klp_verify_vmlinux_symbol(reloc->name, + reloc->val); + if (ret) + return ret; + } else { + /* module, reloc->val needs to be discovered */ + if (reloc->external) + ret = klp_find_external_symbol(pmod, + reloc->name, + &reloc->val); + else + ret = klp_find_object_symbol(obj->mod->name, + reloc->name, + &reloc->val); + if (ret) + return ret; + } + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, + reloc->val + reloc->addend); + if (ret) { + pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", + reloc->name, reloc->val, ret); + return ret; + } + } + + return 0; +} + +static void notrace klp_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *ops, + struct pt_regs *regs) +{ + struct klp_func *func = ops->private; + + regs->ip = (unsigned long)func->new_func; +} + +static int klp_disable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(func->state != KLP_ENABLED)) + return -EINVAL; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + ret = unregister_ftrace_function(func->fops); + if (ret) { + pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + if (ret) + pr_warn("function unregister succeeded but failed to clear the filter\n"); + + func->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->state != KLP_DISABLED)) + return -EINVAL; + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = register_ftrace_function(func->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + } else { + func->state = KLP_ENABLED; + } + + return ret; +} + +static int klp_disable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + for (func = obj->funcs; func->old_name; func++) { + if (func->state != KLP_ENABLED) + continue; + + ret = klp_disable_func(func); + if (ret) + return ret; + } + + obj->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (WARN_ON(obj->state != KLP_DISABLED)) + return -EINVAL; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_enable_func(func); + if (ret) + goto unregister; + } + obj->state = KLP_ENABLED; + + return 0; + +unregister: + WARN_ON(klp_disable_object(obj)); + return ret; +} + +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + pr_notice("disabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if (obj->state != KLP_ENABLED) + continue; + + ret = klp_disable_object(obj); + if (ret) + return ret; + } + + patch->state = KLP_DISABLED; + + return 0; +} + +/** + * klp_disable_patch() - disables a registered patch + * @patch: The registered, enabled patch to be disabled + * + * Unregisters the patched functions from ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_disable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + if (patch->state == KLP_DISABLED) { + ret = -EINVAL; + goto err; + } + + ret = __klp_disable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_disable_patch); + +static int __klp_enable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (WARN_ON(patch->state != KLP_DISABLED)) + return -EINVAL; + + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); + add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + + pr_notice("enabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + klp_find_object_module(obj); + + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_enable_object(obj); + if (ret) + goto unregister; + } + + patch->state = KLP_ENABLED; + + return 0; + +unregister: + WARN_ON(__klp_disable_patch(patch)); + return ret; +} + +/** + * klp_enable_patch() - enables a registered patch + * @patch: The registered, disabled patch to be enabled + * + * Performs the needed symbol lookups and code relocations, + * then registers the patched functions with ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_enable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + ret = __klp_enable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * Sysfs Interface + * + * /sys/kernel/livepatch + * /sys/kernel/livepatch/ + * /sys/kernel/livepatch//enabled + * /sys/kernel/livepatch// + * /sys/kernel/livepatch/// + */ + +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + unsigned long val; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return -EINVAL; + + if (val != KLP_DISABLED && val != KLP_ENABLED) + return -EINVAL; + + patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + if (val == patch->state) { + /* already in requested state */ + ret = -EINVAL; + goto err; + } + + if (val == KLP_ENABLED) { + ret = __klp_enable_patch(patch); + if (ret) + goto err; + } else { + ret = __klp_disable_patch(patch); + if (ret) + goto err; + } + + mutex_unlock(&klp_mutex); + + return count; + +err: + mutex_unlock(&klp_mutex); + return ret; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); +} + +static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct attribute *klp_patch_attrs[] = { + &enabled_kobj_attr.attr, + NULL +}; + +static void klp_kobj_release_patch(struct kobject *kobj) +{ + /* + * Once we have a consistency model we'll need to module_put() the + * patch module here. See klp_register_patch() for more details. + */ +} + +static struct kobj_type klp_ktype_patch = { + .release = klp_kobj_release_patch, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = klp_patch_attrs, +}; + +static void klp_kobj_release_func(struct kobject *kobj) +{ + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + kfree(func->fops); +} + +static struct kobj_type klp_ktype_func = { + .release = klp_kobj_release_func, + .sysfs_ops = &kobj_sysfs_ops, +}; + +/* + * Free all functions' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. + */ +static void klp_free_funcs_limited(struct klp_object *obj, + struct klp_func *limit) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name && func != limit; func++) + kobject_put(&func->kobj); +} + +/* Clean up when a patched object is unloaded */ +static void klp_free_object_loaded(struct klp_object *obj) +{ + struct klp_func *func; + + obj->mod = NULL; + + for (func = obj->funcs; func->old_name; func++) + func->old_addr = 0; +} + +/* + * Free all objects' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. + */ +static void klp_free_objects_limited(struct klp_patch *patch, + struct klp_object *limit) +{ + struct klp_object *obj; + + for (obj = patch->objs; obj->funcs && obj != limit; obj++) { + klp_free_funcs_limited(obj, NULL); + kobject_put(obj->kobj); + } +} + +static void klp_free_patch(struct klp_patch *patch) +{ + klp_free_objects_limited(patch, NULL); + if (!list_empty(&patch->list)) + list_del(&patch->list); + kobject_put(&patch->kobj); +} + +static int klp_init_func(struct klp_object *obj, struct klp_func *func) +{ + struct ftrace_ops *ops; + int ret; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->private = func; + ops->func = klp_ftrace_handler; + ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC; + func->fops = ops; + func->state = KLP_DISABLED; + + ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, func->old_name); + if (ret) { + kfree(func->fops); + return ret; + } + + return 0; +} + +/* parts of the initialization that is done only when the object is loaded */ +static int klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (obj->relocs) { + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; + } + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_find_verify_func_addr(obj, func); + if (ret) + return ret; + } + + return 0; +} + +static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) +{ + struct klp_func *func; + int ret; + const char *name; + + if (!obj->funcs) + return -EINVAL; + + obj->state = KLP_DISABLED; + + klp_find_object_module(obj); + + name = klp_is_module(obj) ? obj->name : "vmlinux"; + obj->kobj = kobject_create_and_add(name, &patch->kobj); + if (!obj->kobj) + return -ENOMEM; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_init_func(obj, func); + if (ret) + goto free; + } + + if (klp_is_object_loaded(obj)) { + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto free; + } + + return 0; + +free: + klp_free_funcs_limited(obj, func); + kobject_put(obj->kobj); + return ret; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (!patch->objs) + return -EINVAL; + + mutex_lock(&klp_mutex); + + patch->state = KLP_DISABLED; + + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, + klp_root_kobj, patch->mod->name); + if (ret) + goto unlock; + + for (obj = patch->objs; obj->funcs; obj++) { + ret = klp_init_object(patch, obj); + if (ret) + goto free; + } + + list_add(&patch->list, &klp_patches); + + mutex_unlock(&klp_mutex); + + return 0; + +free: + klp_free_objects_limited(patch, obj); + kobject_put(&patch->kobj); +unlock: + mutex_unlock(&klp_mutex); + return ret; +} + +/** + * klp_unregister_patch() - unregisters a patch + * @patch: Disabled patch to be unregistered + * + * Frees the data structures and removes the sysfs interface. + * + * Return: 0 on success, otherwise error + */ +int klp_unregister_patch(struct klp_patch *patch) +{ + int ret = 0; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto out; + } + + if (patch->state == KLP_ENABLED) { + ret = -EBUSY; + goto out; + } + + klp_free_patch(patch); + +out: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_unregister_patch); + +/** + * klp_register_patch() - registers a patch + * @patch: Patch to be registered + * + * Initializes the data structure associated with the patch and + * creates the sysfs interface. + * + * Return: 0 on success, otherwise error + */ +int klp_register_patch(struct klp_patch *patch) +{ + int ret; + + if (!klp_initialized()) + return -ENODEV; + + if (!patch || !patch->mod) + return -EINVAL; + + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. Right now, we don't allow patch modules to unload since + * there is currently no method to determine if a thread is still + * running in the patched code contained in the patch module once + * the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + + ret = klp_init_patch(patch); + if (ret) + module_put(patch->mod); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_register_patch); + +static void klp_module_notify_coming(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto err; + + if (patch->state == KLP_DISABLED) + return; + + pr_notice("applying patch '%s' to loading module '%s'\n", + pmod->name, mod->name); + + ret = klp_enable_object(obj); + if (!ret) + return; + +err: + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); +} + +static void klp_module_notify_going(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + if (patch->state == KLP_DISABLED) + goto disabled; + + pr_notice("reverting patch '%s' on unloading module '%s'\n", + pmod->name, mod->name); + + ret = klp_disable_object(obj); + if (ret) + pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", + pmod->name, mod->name, ret); + +disabled: + klp_free_object_loaded(obj); +} + +static int klp_module_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct module *mod = data; + struct klp_patch *patch; + struct klp_object *obj; + + if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) + return 0; + + mutex_lock(&klp_mutex); + + list_for_each_entry(patch, &klp_patches, list) { + for (obj = patch->objs; obj->funcs; obj++) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + if (action == MODULE_STATE_COMING) { + obj->mod = mod; + klp_module_notify_coming(patch, obj); + } else /* MODULE_STATE_GOING */ + klp_module_notify_going(patch, obj); + + break; + } + } + + mutex_unlock(&klp_mutex); + + return 0; +} + +static struct notifier_block klp_module_nb = { + .notifier_call = klp_module_notify, + .priority = INT_MIN+1, /* called late but before ftrace notifier */ +}; + +static int klp_init(void) +{ + int ret; + + ret = register_module_notifier(&klp_module_nb); + if (ret) + return ret; + + klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); + if (!klp_root_kobj) { + ret = -ENOMEM; + goto unregister; + } + + return 0; + +unregister: + unregister_module_notifier(&klp_module_nb); + return ret; +} + +module_init(klp_init); -- cgit v1.2.3 From b5bfc51707f1b56b0b733980bb4fcc0562bf02d8 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 19 Dec 2014 14:11:17 +0800 Subject: livepatch: move x86 specific ftrace handler code to arch/x86 The execution flow redirection related implemention in the livepatch ftrace handler is depended on the specific architecture. This patch introduces klp_arch_set_pc(like kgdb_arch_set_pc) interface to change the pt_regs. Signed-off-by: Li Bin Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 5 +++++ kernel/livepatch/core.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index d529db1b1edf..b5608d7757fd 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -22,6 +22,7 @@ #define _ASM_X86_LIVEPATCH_H #include +#include #ifdef CONFIG_LIVE_PATCHING #ifndef CC_USING_FENTRY @@ -30,6 +31,10 @@ extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} #else #error Live patching support is disabled; check CONFIG_LIVE_PATCHING #endif diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f99fe189d596..07a2db9d01e6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -272,7 +272,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, { struct klp_func *func = ops->private; - regs->ip = (unsigned long)func->new_func; + klp_arch_set_pc(regs, (unsigned long)func->new_func); } static int klp_disable_func(struct klp_func *func) -- cgit v1.2.3 From 33e8612f64d4973ae3617a01224ec02b7b879597 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Dec 2014 13:39:54 +0100 Subject: livepatch: use FTRACE_OPS_FL_IPMODIFY Use the FTRACE_OPS_FL_IPMODIFY flag to prevent conflicts with other ftrace users who also modify regs->ip. Signed-off-by: Josh Poimboeuf Reviewed-by: Petr Mladek Acked-by: Masami Hiramatsu Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 07a2db9d01e6..6f6387912da7 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -641,7 +641,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) ops->private = func; ops->func = klp_ftrace_handler; - ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC; + ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; func->fops = ops; func->state = KLP_DISABLED; -- cgit v1.2.3 From cf6ab6d9143b157786bf29bca5c32e55234bb07d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 15 Dec 2014 20:13:31 -0500 Subject: tracing: Add ref count to tracer for when they are being read by pipe When one of the trace pipe files are being read (by either the trace_pipe or trace_pipe_raw), do not allow the current_trace to change. By adding a ref count that is incremented when the pipe files are opened, will prevent the current_trace from being changed. This will allow for the removal of the global trace_types_lock from reading the pipe buffers (which is currently a bottle neck). Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 +++++++++++++++- kernel/trace/trace.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e767972e99c..ed3fba1d6570 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4140,6 +4140,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) goto out; } + /* If trace pipe files are being read, we can't change the tracer */ + if (tr->current_trace->ref) { + ret = -EBUSY; + goto out; + } + trace_branch_disable(); tr->current_trace->enabled--; @@ -4363,6 +4369,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->trace->pipe_open(iter); nonseekable_open(inode, filp); + + tr->current_trace->ref++; out: mutex_unlock(&trace_types_lock); return ret; @@ -4382,6 +4390,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); + tr->current_trace->ref--; + if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -5331,6 +5341,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; + tr->current_trace->ref++; + mutex_unlock(&trace_types_lock); ret = nonseekable_open(inode, filp); @@ -5437,6 +5449,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); + iter->tr->current_trace->ref--; + __trace_array_put(iter->tr); if (info->spare) @@ -6416,7 +6430,7 @@ static int instance_delete(const char *name) goto out_unlock; ret = -EBUSY; - if (tr->ref) + if (tr->ref || (tr->current_trace && tr->current_trace->ref)) goto out_unlock; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8de48bac1ce2..0eddfeb05fee 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -388,6 +388,7 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; int enabled; + int ref; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE -- cgit v1.2.3 From 574732c73d155320f9358d9ee5d84beb0f4ecee2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 23 Dec 2014 15:05:36 +1030 Subject: param: initialize store function to NULL if not available. I rebased Kees' 'param: do not set store func without write perm' on top of my 'params: cleanup sysfs allocation'. However, my patch uses krealloc which doesn't zero memory, leaving .store unset. Reported-by: Sasha Levin Cc: Kees Cook Signed-off-by: Rusty Russell --- kernel/params.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 0af9b2c4e56c..bd65d136a470 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -648,6 +648,8 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, /* Do not allow runtime DAC changes to make param writable. */ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + else + mk->mp->attrs[mk->mp->num].mattr.store = NULL; mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; mk->mp->num++; -- cgit v1.2.3 From d716ff71dd12bc6328f84a9ec1c3647daf01c827 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 15 Dec 2014 22:31:07 -0500 Subject: tracing: Remove taking of trace_types_lock in pipe files Taking the global mutex "trace_types_lock" in the trace_pipe files causes a bottle neck as most the pipe files can be read per cpu and there's no reason to serialize them. The current_trace variable was given a ref count and it can not change when the ref count is not zero. Opening the trace_pipe files will up the ref count (and decremented on close), so that the lock no longer needs to be taken when accessing the current_trace variable. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 110 +++++++++++++-------------------------------------- 1 file changed, 28 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed3fba1d6570..7669b1f3234e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4332,17 +4332,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) } trace_seq_init(&iter->seq); - - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. - */ - iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) { - ret = -ENOMEM; - goto fail; - } - *iter->trace = *tr->current_trace; + iter->trace = tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -4399,7 +4389,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) free_cpumask_var(iter->started); mutex_destroy(&iter->mutex); - kfree(iter->trace); kfree(iter); trace_array_put(tr); @@ -4432,7 +4421,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* Must be called with trace_types_lock mutex held. */ +/* Must be called with iter->mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; @@ -4477,7 +4466,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - struct trace_array *tr = iter->tr; ssize_t sret; /* return any leftover data */ @@ -4487,12 +4475,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_seq_init(&iter->seq); - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself @@ -4652,7 +4634,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -4660,12 +4641,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (splice_grow_spd(pipe, &spd)) return -ENOMEM; - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - mutex_lock(&iter->mutex); if (iter->trace->splice_read) { @@ -5373,21 +5348,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!count) return 0; - mutex_lock(&trace_types_lock); - #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - size = -EBUSY; - goto out_unlock; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif if (!info->spare) info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - size = -ENOMEM; if (!info->spare) - goto out_unlock; + return -ENOMEM; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) @@ -5403,21 +5373,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter)) { - if ((filp->f_flags & O_NONBLOCK)) { - size = -EAGAIN; - goto out_unlock; - } - mutex_unlock(&trace_types_lock); + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + ret = wait_on_pipe(iter, false); - mutex_lock(&trace_types_lock); - if (ret) { - size = ret; - goto out_unlock; - } + if (ret) + return ret; + goto again; } - size = 0; - goto out_unlock; + return 0; } info->read = 0; @@ -5427,18 +5392,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) { - size = -EFAULT; - goto out_unlock; - } + if (ret == size) + return -EFAULT; + size -= ret; *ppos += size; info->read += size; - out_unlock: - mutex_unlock(&trace_types_lock); - return size; } @@ -5536,30 +5497,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, size, i; ssize_t ret = 0; - mutex_lock(&trace_types_lock); - #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) { - ret = -ENOMEM; - goto out; - } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) { - ret = -EINVAL; - goto out; - } + if (*ppos & (PAGE_SIZE - 1)) + return -EINVAL; if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) { - ret = -EINVAL; - goto out; - } + if (len < PAGE_SIZE) + return -EINVAL; len &= PAGE_MASK; } @@ -5620,25 +5571,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { if (ret) - goto out; + return ret; + + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) + return -EAGAIN; - if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { - ret = -EAGAIN; - goto out; - } - mutex_unlock(&trace_types_lock); ret = wait_on_pipe(iter, true); - mutex_lock(&trace_types_lock); if (ret) - goto out; + return ret; goto again; } ret = splice_to_pipe(pipe, &spd); splice_shrink_spd(&spd); -out: - mutex_unlock(&trace_types_lock); return ret; } -- cgit v1.2.3 From 74d23cc704d19732e70ef1579a669f7d5f09dd9a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:46:56 +0100 Subject: time: move the timecounter/cyclecounter code into its own file. The timecounter code has almost nothing to do with the clocksource code. Let it live in its own file. This will help isolate the timecounter users from the clocksource users in the source tree. Signed-off-by: Richard Cochran Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe.h | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 2 +- drivers/net/ethernet/freescale/fec.h | 1 + drivers/net/ethernet/intel/e1000e/e1000.h | 2 +- drivers/net/ethernet/intel/igb/igb.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 +- drivers/net/ethernet/ti/cpts.h | 1 + include/clocksource/arm_arch_timer.h | 2 +- include/linux/clocksource.h | 102 ----------------------- include/linux/mlx4/device.h | 2 +- include/linux/timecounter.h | 122 ++++++++++++++++++++++++++++ include/linux/types.h | 3 + kernel/time/Makefile | 2 +- kernel/time/clocksource.c | 76 ----------------- kernel/time/timecounter.c | 95 ++++++++++++++++++++++ sound/pci/hda/hda_priv.h | 2 +- 16 files changed, 231 insertions(+), 187 deletions(-) create mode 100644 include/linux/timecounter.h create mode 100644 kernel/time/timecounter.c (limited to 'kernel') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index f9ec762ac3f0..2af6affc35a7 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -124,7 +124,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index c3a6072134f5..792ba72fb5c8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* compilation time flags */ diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 469691ad4a1e..df8bbddaeb37 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -16,6 +16,7 @@ #include #include #include +#include #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \ defined(CONFIG_M520x) || defined(CONFIG_M532x) || \ diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 7785240a0da1..9416e5a7e0c8 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 82d891e183b1..ee22da391474 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -29,7 +29,7 @@ #include "e1000_mac.h" #include "e1000_82575.h" -#include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index b6137be43920..38fc64cf5dca 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 1a581ef7eee8..69a46b92c7d6 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -27,6 +27,7 @@ #include #include #include +#include struct cpsw_cpts { u32 idver; /* Identification and version */ diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 6d26b40cbf5d..9916d0e4eff5 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -16,7 +16,7 @@ #ifndef __CLKSOURCE_ARM_ARCH_TIMER_H #define __CLKSOURCE_ARM_ARCH_TIMER_H -#include +#include #include #define ARCH_TIMER_CTRL_ENABLE (1 << 0) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index abcafaa20b86..9c78d15d33e4 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -18,8 +18,6 @@ #include #include -/* clocksource cycle base type */ -typedef u64 cycle_t; struct clocksource; struct module; @@ -27,106 +25,6 @@ struct module; #include #endif -/** - * struct cyclecounter - hardware abstraction for a free running counter - * Provides completely state-free accessors to the underlying hardware. - * Depending on which hardware it reads, the cycle counter may wrap - * around quickly. Locking rules (if necessary) have to be defined - * by the implementor and user of specific instances of this API. - * - * @read: returns the current cycle value - * @mask: bitmask for two's complement - * subtraction of non 64 bit counters, - * see CLOCKSOURCE_MASK() helper macro - * @mult: cycle to nanosecond multiplier - * @shift: cycle to nanosecond divisor (power of two) - */ -struct cyclecounter { - cycle_t (*read)(const struct cyclecounter *cc); - cycle_t mask; - u32 mult; - u32 shift; -}; - -/** - * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds - * Contains the state needed by timecounter_read() to detect - * cycle counter wrap around. Initialize with - * timecounter_init(). Also used to convert cycle counts into the - * corresponding nanosecond counts with timecounter_cyc2time(). Users - * of this code are responsible for initializing the underlying - * cycle counter hardware, locking issues and reading the time - * more often than the cycle counter wraps around. The nanosecond - * counter will only wrap around after ~585 years. - * - * @cc: the cycle counter used by this instance - * @cycle_last: most recent cycle counter value seen by - * timecounter_read() - * @nsec: continuously increasing count - */ -struct timecounter { - const struct cyclecounter *cc; - cycle_t cycle_last; - u64 nsec; -}; - -/** - * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds - * @cc: Pointer to cycle counter. - * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. - */ -static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) -{ - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; -} - -/** - * timecounter_init - initialize a time counter - * @tc: Pointer to time counter which is to be initialized/reset - * @cc: A cycle counter, ready to be used. - * @start_tstamp: Arbitrary initial time stamp. - * - * After this call the current cycle register (roughly) corresponds to - * the initial time stamp. Every call to timecounter_read() increments - * the time stamp counter by the number of elapsed nanoseconds. - */ -extern void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp); - -/** - * timecounter_read - return nanoseconds elapsed since timecounter_init() - * plus the initial time stamp - * @tc: Pointer to time counter. - * - * In other words, keeps track of time since the same epoch as - * the function which generated the initial time stamp. - */ -extern u64 timecounter_read(struct timecounter *tc); - -/** - * timecounter_cyc2time - convert a cycle counter to same - * time base as values returned by - * timecounter_read() - * @tc: Pointer to time counter. - * @cycle_tstamp: a value returned by tc->cc->read() - * - * Cycle counts that are converted correctly as long as they - * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], - * with "max cycle count" == cs->mask+1. - * - * This allows conversion of cycle counter values which were generated - * in the past. - */ -extern u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp); - /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 25c791e295fd..f1e41b33462f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -42,7 +42,7 @@ #include -#include +#include #define MAX_MSIX_P_PORT 17 #define MAX_MSIX 64 diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h new file mode 100644 index 000000000000..146f07a6651b --- /dev/null +++ b/include/linux/timecounter.h @@ -0,0 +1,122 @@ +/* + * linux/include/linux/timecounter.h + * + * based on code that migrated away from + * linux/include/linux/clocksource.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef _LINUX_TIMECOUNTER_H +#define _LINUX_TIMECOUNTER_H + +#include + +/** + * struct cyclecounter - hardware abstraction for a free running counter + * Provides completely state-free accessors to the underlying hardware. + * Depending on which hardware it reads, the cycle counter may wrap + * around quickly. Locking rules (if necessary) have to be defined + * by the implementor and user of specific instances of this API. + * + * @read: returns the current cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters, + * see CLOCKSOURCE_MASK() helper macro + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + */ +struct cyclecounter { + cycle_t (*read)(const struct cyclecounter *cc); + cycle_t mask; + u32 mult; + u32 shift; +}; + +/** + * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * Contains the state needed by timecounter_read() to detect + * cycle counter wrap around. Initialize with + * timecounter_init(). Also used to convert cycle counts into the + * corresponding nanosecond counts with timecounter_cyc2time(). Users + * of this code are responsible for initializing the underlying + * cycle counter hardware, locking issues and reading the time + * more often than the cycle counter wraps around. The nanosecond + * counter will only wrap around after ~585 years. + * + * @cc: the cycle counter used by this instance + * @cycle_last: most recent cycle counter value seen by + * timecounter_read() + * @nsec: continuously increasing count + */ +struct timecounter { + const struct cyclecounter *cc; + cycle_t cycle_last; + u64 nsec; +}; + +/** + * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds + * @cc: Pointer to cycle counter. + * @cycles: Cycles + * + * XXX - This could use some mult_lxl_ll() asm optimization. Same code + * as in cyc2ns, but with unsigned result. + */ +static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, + cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cc->mult) >> cc->shift; + return ret; +} + +/** + * timecounter_init - initialize a time counter + * @tc: Pointer to time counter which is to be initialized/reset + * @cc: A cycle counter, ready to be used. + * @start_tstamp: Arbitrary initial time stamp. + * + * After this call the current cycle register (roughly) corresponds to + * the initial time stamp. Every call to timecounter_read() increments + * the time stamp counter by the number of elapsed nanoseconds. + */ +extern void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp); + +/** + * timecounter_read - return nanoseconds elapsed since timecounter_init() + * plus the initial time stamp + * @tc: Pointer to time counter. + * + * In other words, keeps track of time since the same epoch as + * the function which generated the initial time stamp. + */ +extern u64 timecounter_read(struct timecounter *tc); + +/** + * timecounter_cyc2time - convert a cycle counter to same + * time base as values returned by + * timecounter_read() + * @tc: Pointer to time counter. + * @cycle_tstamp: a value returned by tc->cc->read() + * + * Cycle counts that are converted correctly as long as they + * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], + * with "max cycle count" == cs->mask+1. + * + * This allows conversion of cycle counter values which were generated + * in the past. + */ +extern u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp); + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index a0bb7048687f..62323825cff9 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -213,5 +213,8 @@ struct callback_head { }; #define rcu_head callback_head +/* clocksource cycle base type */ +typedef u64 cycle_t; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f622cf28628a..c09c07817d7a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b79f39bda7e1..4892352f0e49 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -34,82 +34,6 @@ #include "tick-internal.h" #include "timekeeping_internal.h" -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. - * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. - */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); - -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) -{ - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. - */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); - /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks * @mult: pointer to mult variable diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000000..59a1ec3a57cb --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,95 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (cycle_delta > tc->cc->mask / 2) { + cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + } else { + nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/sound/pci/hda/hda_priv.h b/sound/pci/hda/hda_priv.h index 166e3e84b963..daf458299753 100644 --- a/sound/pci/hda/hda_priv.h +++ b/sound/pci/hda/hda_priv.h @@ -15,7 +15,7 @@ #ifndef __SOUND_HDA_PRIV_H #define __SOUND_HDA_PRIV_H -#include +#include #include #include -- cgit v1.2.3 From 2eebdde6528a722fbf8e2cffcf7aa52cbb4c2de0 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:47:06 +0100 Subject: timecounter: keep track of accumulated fractional nanoseconds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current timecounter implementation will drop a variable amount of resolution, depending on the magnitude of the time delta. In other words, reading the clock too often or too close to a time stamp conversion will introduce errors into the time values. This patch fixes the issue by introducing a fractional nanosecond field that accumulates the low order bits. Reported-by: Janusz Użycki Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_clock.c | 4 ++-- include/linux/timecounter.h | 19 ++++++++++------ kernel/time/timecounter.c | 31 +++++++++++++++++++++------ virt/kvm/arm/arch_timer.c | 3 ++- 4 files changed, 40 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index df35d0e1b899..e9cce4f72b24 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -240,7 +240,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) { struct mlx4_dev *dev = mdev->dev; unsigned long flags; - u64 ns; + u64 ns, zero = 0; rwlock_init(&mdev->clock_lock); @@ -265,7 +265,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) /* Calculate period in seconds to call the overflow watchdog - to make * sure counter is checked at least once every wrap around. */ - ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask); + ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask, zero, &zero); do_div(ns, NSEC_PER_SEC / 2 / HZ); mdev->overflow_period = ns; diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index af3dfa4e90f0..74f45496e6d1 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -55,27 +55,32 @@ struct cyclecounter { * @cycle_last: most recent cycle counter value seen by * timecounter_read() * @nsec: continuously increasing count + * @mask: bit mask for maintaining the 'frac' field + * @frac: accumulated fractional nanoseconds */ struct timecounter { const struct cyclecounter *cc; cycle_t cycle_last; u64 nsec; + u64 mask; + u64 frac; }; /** * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds * @cc: Pointer to cycle counter. * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. + * @mask: bit mask for maintaining the 'frac' field + * @frac: pointer to storage for the fractional nanoseconds. */ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) + cycle_t cycles, u64 mask, u64 *frac) { - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; + u64 ns = (u64) cycles; + + ns = (ns * cc->mult) + *frac; + *frac = ns & mask; + return ns >> cc->shift; } /** diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 59a1ec3a57cb..4687b3104bae 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -25,6 +25,8 @@ void timecounter_init(struct timecounter *tc, tc->cc = cc; tc->cycle_last = cc->read(cc); tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; } EXPORT_SYMBOL_GPL(timecounter_init); @@ -51,7 +53,8 @@ static u64 timecounter_read_delta(struct timecounter *tc) cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); /* update time stamp of timecounter_read_delta() call: */ tc->cycle_last = cycle_now; @@ -72,22 +75,36 @@ u64 timecounter_read(struct timecounter *tc) } EXPORT_SYMBOL_GPL(timecounter_read); +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + cycle_t cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + u64 timecounter_cyc2time(struct timecounter *tc, cycle_t cycle_tstamp) { - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; /* * Instead of always treating cycle_tstamp as more recent * than tc->cycle_last, detect when it is too far in the * future and treat it as old time stamp instead. */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); } return nsec; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 1c0772b340d8..6e54f3542126 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -152,7 +152,8 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) return; } - ns = cyclecounter_cyc2ns(timecounter->cc, cval - now); + ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask, + &timecounter->frac); timer_arm(timer, ns); } -- cgit v1.2.3 From 734d16801349fbe951d2f780191d32c5b8a892d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 14:45:12 -0800 Subject: rcu: Make rcu_nmi_enter() handle nesting The x86 architecture has multiple types of NMI-like interrupts: real NMIs, machine checks, and, for some values of NMI-like, debugging and breakpoint interrupts. These interrupts can nest inside each other. Andy Lutomirski is adding RCU support to these interrupts, so rcu_nmi_enter() and rcu_nmi_exit() must now correctly handle nesting. This commit therefore introduces nesting, using a clever NMI-coordination algorithm suggested by Andy. The trick is to atomically increment ->dynticks (if needed) before manipulating ->dynticks_nmi_nesting on entry (and, accordingly, after on exit). In addition, ->dynticks_nmi_nesting is incremented by one if ->dynticks was incremented and by two otherwise. This means that when rcu_nmi_exit() sees ->dynticks_nmi_nesting equal to one, it knows that ->dynticks must be atomically incremented. This NMI-coordination algorithms has been validated by the following Promela model: ------------------------------------------------------------------------ /* * Promela model for Andy Lutomirski's suggested change to rcu_nmi_enter() * that allows nesting. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, you can access it online at * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2014 * * Author: Paul E. McKenney */ byte dynticks_nmi_nesting = 0; byte dynticks = 0; /* * Promela verision of rcu_nmi_enter(). */ inline rcu_nmi_enter() { byte incby; byte tmp; incby = BUSY_INCBY; assert(dynticks_nmi_nesting >= 0); if :: (dynticks & 1) == 0 -> atomic { dynticks = dynticks + 1; } assert((dynticks & 1) == 1); incby = 1; :: else -> skip; fi; tmp = dynticks_nmi_nesting; tmp = tmp + incby; dynticks_nmi_nesting = tmp; assert(dynticks_nmi_nesting >= 1); } /* * Promela verision of rcu_nmi_exit(). */ inline rcu_nmi_exit() { byte tmp; assert(dynticks_nmi_nesting > 0); assert((dynticks & 1) != 0); if :: dynticks_nmi_nesting != 1 -> tmp = dynticks_nmi_nesting; tmp = tmp - BUSY_INCBY; dynticks_nmi_nesting = tmp; :: else -> dynticks_nmi_nesting = 0; atomic { dynticks = dynticks + 1; } assert((dynticks & 1) == 0); fi; } /* * Base-level NMI runs non-atomically. Crudely emulates process-level * dynticks-idle entry/exit. */ proctype base_NMI() { byte busy; busy = 0; do :: /* Emulate base-level dynticks and not. */ if :: 1 -> atomic { dynticks = dynticks + 1; } busy = 1; :: 1 -> skip; fi; /* Verify that we only sometimes have base-level dynticks. */ if :: busy == 0 -> skip; :: busy == 1 -> skip; fi; /* Model RCU's NMI entry and exit actions. */ rcu_nmi_enter(); assert((dynticks & 1) == 1); rcu_nmi_exit(); /* Emulated re-entering base-level dynticks and not. */ if :: !busy -> skip; :: busy -> atomic { dynticks = dynticks + 1; } busy = 0; fi; /* We had better now be in dyntick-idle mode. */ assert((dynticks & 1) == 0); od; } /* * Nested NMI runs atomically to emulate interrupting base_level(). */ proctype nested_NMI() { do :: /* * Use an atomic section to model a nested NMI. This is * guaranteed to interleave into base_NMI() between a pair * of base_NMI() statements, just as a nested NMI would. */ atomic { /* Verify that we only sometimes are in dynticks. */ if :: (dynticks & 1) == 0 -> skip; :: (dynticks & 1) == 1 -> skip; fi; /* Model RCU's NMI entry and exit actions. */ rcu_nmi_enter(); assert((dynticks & 1) == 1); rcu_nmi_exit(); } od; } init { run base_NMI(); run nested_NMI(); } ------------------------------------------------------------------------ The following script can be used to run this model if placed in rcu_nmi.spin: ------------------------------------------------------------------------ if ! spin -a rcu_nmi.spin then echo Spin errors!!! exit 1 fi if ! cc -DSAFETY -o pan pan.c then echo Compilation errors!!! exit 1 fi ./pan -m100000 Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- kernel/rcu/tree.c | 66 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7680fc275036..4c106fcc0d54 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -759,39 +759,71 @@ void rcu_irq_enter(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is active. + * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and + * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) */ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int incby = 2; - if (rdtp->dynticks_nmi_nesting == 0 && - (atomic_read(&rdtp->dynticks) & 0x1)) - return; - rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + /* Complain about underflow. */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (!(atomic_read(&rdtp->dynticks) & 0x1)) { + smp_mb__before_atomic(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* atomic_inc() before later RCU read-side crit sects */ + smp_mb__after_atomic(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + incby = 1; + } + rdtp->dynticks_nmi_nesting += incby; + barrier(); } /** * rcu_nmi_exit - inform RCU of exit from NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is no longer active. + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. */ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (rdtp->dynticks_nmi_nesting == 0 || - --rdtp->dynticks_nmi_nesting != 0) + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + rdtp->dynticks_nmi_nesting -= 2; return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + rdtp->dynticks_nmi_nesting = 0; /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); -- cgit v1.2.3 From ca9558a33f658155c3b69f92897c2e6a848684f5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 31 Oct 2014 14:55:05 +0000 Subject: rcu: Remove redundant rcu_is_cpu_rrupt_from_idle() from tiny RCU Let's start assuming that something in the idle loop posts a callback, and scheduling-clock interrupt occurs: 1. The system is idle and stays that way, no runnable tasks. 2. Scheduling-clock interrupt occurs, rcu_check_callbacks() is called as result, which in turn calls rcu_is_cpu_rrupt_from_idle(). 3. rcu_is_cpu_rrupt_from_idle() reports the CPU was interrupted from idle, which results in rcu_sched_qs() call, which does a raise_softirq(RCU_SOFTIRQ). 4. Upon return from interrupt, rcu_irq_exit() is invoked, which calls rcu_idle_enter_common(), which in turn calls rcu_sched_qs() again, which does another raise_softirq(RCU_SOFTIRQ). 5. The softirq happens shortly and invokes rcu_process_callbacks(), which invokes __rcu_process_callbacks(). 6. So now callbacks can be invoked. At least they can be if ->donetail has been updated. Which it will have been because rcu_sched_qs() invokes rcu_qsctr_help(). In the described scenario rcu_sched_qs() and raise_softirq(RCU_SOFTIRQ) get called twice in steps 3 and 4. This redundancy could be eliminated by removing rcu_is_cpu_rrupt_from_idle() function. Signed-off-by: Alexander Gordeev Cc: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649f8817..805b6d59c854 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -185,16 +185,6 @@ EXPORT_SYMBOL(__rcu_is_watching); #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ -/* - * Test whether the current CPU was interrupted from idle. Nested - * interrupts don't count, we must be running at the first interrupt - * level. - */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return rcu_dynticks_nesting <= 1; -} - /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers @@ -250,7 +240,7 @@ void rcu_bh_qs(void) void rcu_check_callbacks(int user) { RCU_TRACE(check_cpu_stalls()); - if (user || rcu_is_cpu_rrupt_from_idle()) + if (user) rcu_sched_qs(); else if (!in_softirq()) rcu_bh_qs(); -- cgit v1.2.3 From 924df8a0117aa2725750f1ec4d282867534d9a89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Oct 2014 15:37:58 -0700 Subject: rcu: Fix invoke_rcu_callbacks() comment Despite what the comment says, it is only softirqs that are disabled, not interrupts. This commit therefore fixes the comment. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..9c25b99e2c33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2601,7 +2601,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) * Schedule RCU callback invocation. If the specified type of RCU * does not support RCU priority boosting, just do a direct call, * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with interrupts disabled, the + * are running on the current CPU with softirqs disabled, the * rcu_cpu_kthread_task cannot disappear out from under us. */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) -- cgit v1.2.3 From 113948d841e8d78039e5dbbb5248f5b73e99eafa Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:19 +0100 Subject: spinlock: Add spin_lock_bh_nested() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/spinlock.h | 8 ++++++++ include/linux/spinlock_api_smp.h | 2 ++ include/linux/spinlock_api_up.h | 1 + kernel/locking/spinlock.c | 8 ++++++++ 4 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 262ba4ef9a8e..3e18379dfa6f 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -190,6 +190,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) +# define raw_spin_lock_bh_nested(lock, subclass) \ + _raw_spin_lock_bh_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ @@ -205,6 +207,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) +# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -324,6 +327,11 @@ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) +#define spin_lock_bh_nested(lock, subclass) \ +do { \ + raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\ +} while (0) + #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 42dfab89e740..5344268e6e62 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr); void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) + __acquires(lock); void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..d3afef9d8dbe 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -57,6 +57,7 @@ #define _raw_spin_lock(lock) __LOCK(lock) #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) +#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..db3ccb1dd614 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh_nested); + unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { -- cgit v1.2.3 From 5f6130fa52ee0df0b1da4518c5bbef42bcfe7d83 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 9 Dec 2014 17:53:34 +0800 Subject: tiny_rcu: Directly force QS when call_rcu_[bh|sched]() on idle_task For RCU in UP, context-switch = QS = GP, thus we can force a context-switch when any call_rcu_[bh|sched]() is happened on idle_task. After doing so, rcu_idle/irq_enter/exit() are useless, so we can simply make these functions empty. More important, this change does not change the functionality logically. Note: raise_softirq(RCU_SOFTIRQ)/rcu_sched_qs() in rcu_idle_enter() and outmost rcu_irq_exit() will have to wake up the ksoftirqd (due to in_interrupt() == 0). Before this patch After this patch: call_rcu_sched() in idle; call_rcu_sched() in idle set resched do other stuffs; do other stuffs outmost rcu_irq_exit() outmost rcu_irq_exit() (empty function) (or rcu_idle_enter()) (or rcu_idle_enter(), also empty function) start to resched. (see above) rcu_sched_qs() rcu_sched_qs() QS,and GP and advance cb QS,and GP and advance cb wake up the ksoftirqd wake up the ksoftirqd set resched resched to ksoftirqd (or other) resched to ksoftirqd (or other) These two code patches are almost the same. Size changed after patched: size kernel/rcu/tiny-old.o kernel/rcu/tiny-patched.o text data bss dec hex filename 3449 206 8 3663 e4f kernel/rcu/tiny-old.o 2406 144 8 2558 9fe kernel/rcu/tiny-patched.o Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 6 +++ kernel/rcu/tiny.c | 99 ++++-------------------------------------------- kernel/rcu/tiny_plugin.h | 2 +- kernel/rcu/tree.c | 6 --- 4 files changed, 14 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 07bb02eda844..80adef7d4c3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void); void rcu_early_boot_tests(void); +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 805b6d59c854..85287ec054fd 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - #include "tiny_plugin.h" -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ -static void rcu_idle_enter_common(long long newval) -{ - if (newval) { - RCU_TRACE(trace_rcu_dyntick(TPS("--="), - rcu_dynticks_nesting, newval)); - rcu_dynticks_nesting = newval; - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("Start"), - rcu_dynticks_nesting, newval)); - if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), - rcu_dynticks_nesting, newval)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_sched_qs(); /* implies rcu_bh_inc() */ - barrier(); - rcu_dynticks_nesting = newval; -} - /* * Enter idle, which is an extended quiescent state if we have fully - * entered that mode (i.e., if the new value of dynticks_nesting is zero). + * entered that mode. */ void rcu_idle_enter(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) - newval = 0; - else - newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_irq_exit(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - newval = rcu_dynticks_nesting - 1; - WARN_ON_ONCE(newval < 0); - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_exit); -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ -static void rcu_idle_exit_common(long long oldval) -{ - if (oldval) { - RCU_TRACE(trace_rcu_dyntick(TPS("++="), - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); - if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - /* * Exit idle, so that we are no longer in an extended quiescent state. */ void rcu_idle_exit(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) - rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_irq_enter(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting++; - WARN_ON_ONCE(rcu_dynticks_nesting == 0); - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_enter); @@ -179,7 +89,7 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); */ bool notrace __rcu_is_watching(void) { - return rcu_dynticks_nesting; + return true; } EXPORT_SYMBOL(__rcu_is_watching); @@ -347,6 +257,11 @@ static void __call_rcu(struct rcu_head *head, rcp->curtail = &head->next; RCU_TRACE(rcp->qlen++); local_irq_restore(flags); + + if (unlikely(is_idle_task(current))) { + /* force scheduling for rcu_sched_qs() */ + resched_cpu(0); + } } /* diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c56569127..b5fc5e5fc85d 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -147,7 +147,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) js = ACCESS_ONCE(rcp->jiffies_stall); if (*rcp->curtail && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9c25b99e2c33..203b50d7ecbd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -934,12 +934,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, } } -/* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - /* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks -- cgit v1.2.3 From 87af9e7ff9d909e70a006ca0974466e2a1d8db0a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 12 Dec 2014 10:11:44 +0100 Subject: hotplugcpu: Avoid deadlocks by waking active_writer Commit b2c4623dcd07 ("rcu: More on deadlock between CPU hotplug and expedited grace periods") introduced another problem that can easily be reproduced by starting/stopping cpus in a loop. E.g.: for i in `seq 5000`; do echo 1 > /sys/devices/system/cpu/cpu1/online echo 0 > /sys/devices/system/cpu/cpu1/online done Will result in: INFO: task /cpu_start_stop:1 blocked for more than 120 seconds. Call Trace: ([<00000000006a028e>] __schedule+0x406/0x91c) [<0000000000130f60>] cpu_hotplug_begin+0xd0/0xd4 [<0000000000130ff6>] _cpu_up+0x3e/0x1c4 [<0000000000131232>] cpu_up+0xb6/0xd4 [<00000000004a5720>] device_online+0x80/0xc0 [<00000000004a57f0>] online_store+0x90/0xb0 ... And a deadlock. Problem is that if the last ref in put_online_cpus() can't get the cpu_hotplug.lock the puts_pending count is incremented, but a sleeping active_writer might never be woken up, therefore never exiting the loop in cpu_hotplug_begin(). This fix removes puts_pending and turns refcount into an atomic variable. We also introduce a wait queue for the active_writer, to avoid possible races and use-after-free. There is no need to take the lock in put_online_cpus() anymore. Can't reproduce it with this fix. Signed-off-by: David Hildenbrand Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 56 +++++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5d220234b3ca..1972b161c61e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,22 +58,23 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ + /* wait queue to wake up the active_writer */ + wait_queue_head_t wq; + /* verifies that no writer will get active while readers are active */ + struct mutex lock; /* * Also blocks the new readers during * an ongoing cpu hotplug operation. */ - int refcount; - /* And allows lockless put_online_cpus(). */ - atomic_t puts_pending; + atomic_t refcount; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } cpu_hotplug = { .active_writer = NULL, + .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), - .refcount = 0, #ifdef CONFIG_DEBUG_LOCK_ALLOC .dep_map = {.name = "cpu_hotplug.lock" }, #endif @@ -86,15 +87,6 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) -static void apply_puts_pending(int max) -{ - int delta; - - if (atomic_read(&cpu_hotplug.puts_pending) >= max) { - delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); - cpu_hotplug.refcount -= delta; - } -} void get_online_cpus(void) { @@ -103,8 +95,7 @@ void get_online_cpus(void) return; cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); - apply_puts_pending(65536); - cpu_hotplug.refcount++; + atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -116,8 +107,7 @@ bool try_get_online_cpus(void) if (!mutex_trylock(&cpu_hotplug.lock)) return false; cpuhp_lock_acquire_tryread(); - apply_puts_pending(65536); - cpu_hotplug.refcount++; + atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); return true; } @@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus); void put_online_cpus(void) { + int refcount; + if (cpu_hotplug.active_writer == current) return; - if (!mutex_trylock(&cpu_hotplug.lock)) { - atomic_inc(&cpu_hotplug.puts_pending); - cpuhp_lock_release(); - return; - } - if (WARN_ON(!cpu_hotplug.refcount)) - cpu_hotplug.refcount++; /* try to fix things up */ + refcount = atomic_dec_return(&cpu_hotplug.refcount); + if (WARN_ON(refcount < 0)) /* try to fix things up */ + atomic_inc(&cpu_hotplug.refcount); + + if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) + wake_up(&cpu_hotplug.wq); - if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) - wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); cpuhp_lock_release(); } @@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus); */ void cpu_hotplug_begin(void) { - cpu_hotplug.active_writer = current; + DEFINE_WAIT(wait); + cpu_hotplug.active_writer = current; cpuhp_lock_acquire(); + for (;;) { mutex_lock(&cpu_hotplug.lock); - apply_puts_pending(1); - if (likely(!cpu_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); + if (likely(!atomic_read(&cpu_hotplug.refcount))) + break; mutex_unlock(&cpu_hotplug.lock); schedule(); } + finish_wait(&cpu_hotplug.wq, &wait); } void cpu_hotplug_done(void) -- cgit v1.2.3 From 41050a009640f9f330a5b916563ca7faf853a98c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2014 12:31:27 -0800 Subject: rcu: Fix rcu_barrier() race that could result in too-short wait The rcu_barrier() no-callbacks check for no-CBs CPUs has race conditions. It checks a given CPU's lists of callbacks, and if all three no-CBs lists are empty, ignores that CPU. However, these three lists could potentially be empty even when callbacks are present if the check executed just as the callbacks were being moved from one list to another. It turns out that recent versions of rcutorture can spot this race. This commit plugs this hole by consolidating the per-list counts of no-CBs callbacks into a single count, which is incremented before the corresponding callback is posted and after it is invoked. Then rcu_barrier() checks this single count to reliably determine whether the corresponding CPU has no-CBs callbacks. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 29 +++++++---------------------- kernel/rcu/tree_plugin.h | 45 ++++++++++++++++++++++++++++----------------- 3 files changed, 36 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 203b50d7ecbd..5b9f3b972a79 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3344,6 +3344,7 @@ static void _rcu_barrier(struct rcu_state *rsp) } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); + smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..cb5908672f11 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -340,14 +340,10 @@ struct rcu_data { #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ - atomic_long_t nocb_q_count_lazy; /* (approximate). */ + atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ + atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ - atomic_long_t nocb_follower_count_lazy; /* (approximate). */ - int nocb_p_count; /* # CBs being invoked by kthread */ - int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -356,8 +352,6 @@ struct rcu_data { struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; - long nocb_gp_count; - long nocb_gp_count_lazy; bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. */ @@ -622,24 +616,15 @@ static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_NOCB_CPU -/* Sum up queue lengths for tracing. */ +/* Read out queue lengths for tracing. */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + - rdp->nocb_p_count + - atomic_long_read(&rdp->nocb_follower_count) + - rdp->nocb_p_count + rdp->nocb_gp_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + - rdp->nocb_p_count_lazy + - atomic_long_read(&rdp->nocb_follower_count_lazy) + - rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; -} +#ifdef CONFIG_RCU_NOCB_CPU + *ql = atomic_long_read(&rdp->nocb_q_count); + *qll = atomic_long_read(&rdp->nocb_q_count_lazy); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ *ql = 0; *qll = 0; -} #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +} #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..e5c43b7f63f2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2056,9 +2056,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + unsigned long ret; +#ifdef CONFIG_PROVE_RCU struct rcu_head *rhp; +#endif /* #ifdef CONFIG_PROVE_RCU */ + + /* + * Check count of all no-CBs callbacks awaiting invocation. + * There needs to be a barrier before this function is called, + * but associated with a prior determination that no more + * callbacks would be posted. In the worst case, the first + * barrier in _rcu_barrier() suffices (but the caller cannot + * necessarily rely on this, not a substitute for the caller + * getting the concurrency design right!). There must also be + * a barrier between the following load an posting of a callback + * (if a callback is in fact needed). This is associated with an + * atomic_inc() in the caller. + */ + ret = atomic_long_read(&rdp->nocb_q_count); - /* No-CBs CPUs might have callbacks on any of three lists. */ +#ifdef CONFIG_PROVE_RCU rhp = ACCESS_ONCE(rdp->nocb_head); if (!rhp) rhp = ACCESS_ONCE(rdp->nocb_gp_head); @@ -2072,8 +2089,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) cpu, rhp->func); WARN_ON_ONCE(1); } +#endif /* #ifdef CONFIG_PROVE_RCU */ - return !!rhp; + return !!ret; } /* @@ -2095,9 +2113,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct task_struct *t; /* Enqueue the callback on the nocb list and update counts. */ + atomic_long_add(rhcount, &rdp->nocb_q_count); + /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ old_rhpp = xchg(&rdp->nocb_tail, rhtp); ACCESS_ONCE(*old_rhpp) = rhp; - atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ @@ -2288,9 +2307,6 @@ wait_again: /* Move callbacks to wait-for-GP list, which is empty. */ ACCESS_ONCE(rdp->nocb_head) = NULL; rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); - rdp->nocb_gp_count_lazy = - atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); gotcbs = true; } @@ -2338,9 +2354,6 @@ wait_again: /* Append callbacks to follower's "done" list. */ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); *tail = rdp->nocb_gp_head; - atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); - atomic_long_add(rdp->nocb_gp_count_lazy, - &rdp->nocb_follower_count_lazy); smp_mb__after_atomic(); /* Store *tail before wakeup. */ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* @@ -2415,13 +2428,11 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); ACCESS_ONCE(rdp->nocb_follower_head) = NULL; tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); - c = atomic_long_xchg(&rdp->nocb_follower_count, 0); - cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); - rdp->nocb_p_count += c; - rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + trace_rcu_batch_start(rdp->rsp->name, + atomic_long_read(&rdp->nocb_q_count_lazy), + atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; while (list) { next = list->next; @@ -2443,9 +2454,9 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) = - rdp->nocb_p_count_lazy - cl; + smp_mb__before_atomic(); /* _add after CB invocation. */ + atomic_long_add(-c, &rdp->nocb_q_count); + atomic_long_add(-cl, &rdp->nocb_q_count_lazy); rdp->n_nocbs_invoked += c; } return 0; -- cgit v1.2.3 From 5a43b88e98eaca88fa12abcacd0000adc879dd2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Dec 2014 17:55:28 +0800 Subject: rcu: Remove "select IRQ_WORK" from config TREE_RCU The 48a7639ce80c ("rcu: Make callers awaken grace-period kthread") removed the irq_work_queue(), so the TREE_RCU doesn't need irq work any more. This commit therefore updates RCU's Kconfig and Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- init/Kconfig | 2 -- kernel/rcu/tree.h | 1 - 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 9afb971497f4..39b4313c6ca6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -470,7 +470,6 @@ choice config TREE_RCU bool "Tree-based hierarchical RCU" depends on !PREEMPT && SMP - select IRQ_WORK help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -480,7 +479,6 @@ config TREE_RCU config PREEMPT_RCU bool "Preemptible tree-based hierarchical RCU" depends on PREEMPT - select IRQ_WORK help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index cb5908672f11..41860b9a04a2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,7 +27,6 @@ #include #include #include -#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and -- cgit v1.2.3 From b08ea27d95bcaee6d9cf4edd64f373006661424a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Oct 2014 15:39:39 -0700 Subject: rcu: Protect rcu_boost() lockless accesses with ACCESS_ONCE() This commit prevents random compiler optimizations by applying ACCESS_ONCE() to lockless accesses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..d59913ef8360 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1127,7 +1127,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v1.2.3 From 74e871ac6cb17f67cbefb569d98a8d05de666e07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Oct 2014 21:08:53 -0700 Subject: rcu: Rename "empty" to "empty_norm" in preparation for boost rework This commit undertakes a simple variable renaming to make way for some rework of RCU priority boosting. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d59913ef8360..3d22f0b2dea9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -313,8 +313,8 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, */ void rcu_read_unlock_special(struct task_struct *t) { - int empty; int empty_exp; + int empty_norm; int empty_exp_now; unsigned long flags; struct list_head *np; @@ -367,7 +367,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -393,7 +393,7 @@ void rcu_read_unlock_special(struct task_struct *t) * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, -- cgit v1.2.3 From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 11:22:37 -0700 Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() in preparation for the rework of RCU priority boosting. This new function will be invoked from rcu_read_unlock_special() in the reworked scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf rcu_node structure's ->qsmaskinit field has already been updated. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 67 ++++++++++++++++++++++++++++++++++-------------- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 17 ++++++++++++ 3 files changed, 66 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..75c6b3301abb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2226,6 +2226,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) TPS("cpuofl")); } +/* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); /* GP memory ordering. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } +} + /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, @@ -2236,7 +2276,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2252,24 +2291,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ - do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - break; - } - if (rnp == rdp->mynode) - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinit &= ~rdp->grpmask; + if (rnp->qsmaskinit == 0) { + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + rcu_cleanup_dead_rnp(rnp); + } /* * We still hold the leaf rcu_node structure lock here, and diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..9315477b47d9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -552,6 +552,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3d22f0b2dea9..d044b9cbbd97 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -306,6 +306,15 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, return np; } +/* + * Return true if the specified rcu_node structure has tasks that were + * preempted within an RCU read-side critical section. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blkd_tasks); +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -967,6 +976,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } +/* + * Because there is no preemptible RCU, there can be no readers blocked. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return false; +} + #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* -- cgit v1.2.3 From b6a932d1d9840727eee619d455bdeeedaa205be9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 12:05:04 -0700 Subject: rcu: Make rcu_read_unlock_special() propagate ->qsmaskinit bit clearing This commit causes rcu_read_unlock_special() to propagate ->qsmaskinit bit clearing up the rcu_node tree once a given rcu_node structure's blkd_tasks list becomes empty. This is the final commit in preparation for the rework of RCU priority boosting: It enables preempted tasks to remain queued on their rcu_node structure even after all of that rcu_node structure's CPUs have gone offline. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ kernel/rcu/tree_plugin.h | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75c6b3301abb..6625a1b5d9a1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2329,6 +2329,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } +static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ +} + static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d044b9cbbd97..8a2b84157d34 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -322,9 +322,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) */ void rcu_read_unlock_special(struct task_struct *t) { - int empty_exp; - int empty_norm; - int empty_exp_now; + bool empty; + bool empty_exp; + bool empty_norm; + bool empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -376,6 +377,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + empty = !rcu_preempt_has_tasks(rnp); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -395,6 +397,14 @@ void rcu_read_unlock_special(struct task_struct *t) drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ + /* + * If this was the last task on the list, go see if we + * need to propagate ->qsmaskinit bit clearing up the + * rcu_node tree. + */ + if (!empty && !rcu_preempt_has_tasks(rnp)) + rcu_cleanup_dead_rnp(rnp); + /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. -- cgit v1.2.3 From d19fb8d1f3f66cc342d30aa48f090c70afb753ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 12:56:16 -0700 Subject: rcu: Don't migrate blocked tasks even if all corresponding CPUs offline When the last CPU associated with a given leaf rcu_node structure goes offline, something must be done about the tasks queued on that rcu_node structure. Each of these tasks has been preempted on one of the leaf rcu_node structure's CPUs while in an RCU read-side critical section that it have not yet exited. Handling these tasks is the job of rcu_preempt_offline_tasks(), which migrates them from the leaf rcu_node structure to the root rcu_node structure. Unfortunately, this migration has to be done one task at a time because each tasks allegiance must be shifted from the original leaf rcu_node to the root, so that future attempts to deal with these tasks will acquire the root rcu_node structure's ->lock rather than that of the leaf. Worse yet, this migration must be done with interrupts disabled, which is not so good for realtime response, especially given that there is no bound on the number of tasks on a given rcu_node structure's list. (OK, OK, there is a bound, it is just that it is unreasonably large, especially on 64-bit systems.) This was not considered a problem back when rcu_preempt_offline_tasks() was first written because realtime systems were assumed not to do CPU-hotplug operations while real-time applications were running. This assumption has proved of dubious validity given that people are starting to run multiple realtime applications on a single SMP system and that it is common practice to offline then online a CPU before starting its real-time application in order to clear extraneous processing off of that CPU. So we now need CPU hotplug operations to avoid undue latencies. This commit therefore avoids migrating these tasks, instead letting them be dequeued one by one from the original leaf rcu_node structure by rcu_read_unlock_special(). This means that the clearing of bits from the upper-level rcu_node structures must be deferred until the last such task has been dequeued, because otherwise subsequent grace periods won't wait on them. This commit has the beneficial side effect of simplifying the CPU-hotplug code for TREE_PREEMPT_RCU, especially in CONFIG_RCU_BOOST builds. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +------- kernel/rcu/tree.h | 18 ------- kernel/rcu/tree_plugin.h | 126 +---------------------------------------------- 3 files changed, 4 insertions(+), 160 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6625a1b5d9a1..84f16cf05991 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2276,7 +2276,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2295,25 +2294,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinit &= ~rdp->grpmask; - if (rnp->qsmaskinit == 0) { - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) rcu_cleanup_dead_rnp(rnp); - } - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock - * held leads to deadlock. - */ raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); + raw_spin_unlock_irqrestore(&rnp->lock, flags); WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9315477b47d9..883ebc8e2b6e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -514,13 +514,6 @@ extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) -/* Return values for rcu_preempt_offline_tasks(). */ - -#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ - /* GP were moved to root. */ -#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ - /* GP were moved to root. */ - /* * RCU implementation internal declarations: */ @@ -550,24 +543,13 @@ long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, - unsigned long flags); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8a2b84157d34..d594da48f4b4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -103,6 +103,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); /* * Tell them what RCU they are running. @@ -545,92 +547,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. - */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. - */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* @@ -979,13 +895,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * Because there is no preemptible RCU, there can be no readers blocked. */ @@ -1023,23 +932,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1058,20 +950,6 @@ void synchronize_rcu_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). -- cgit v1.2.3 From a8f4cbadfb5c5f7f5d9eef70821276a4e2e9289d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 13:48:41 -0700 Subject: rcu: Shorten irq-disable region in rcu_cleanup_dead_cpu() Now that we are not migrating callbacks, there is no need to hold the ->orphan_lock across the the ->qsmaskinit bit-clearing process. This commit therefore releases ->orphan_lock immediately after adopting the orphaned RCU callbacks. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84f16cf05991..990b406faf4e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2289,14 +2289,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); + raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinit &= ~rdp->grpmask; if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) rcu_cleanup_dead_rnp(rnp); - raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", -- cgit v1.2.3 From 96e92021d4fad8543d598b5e4926cb34fd40c4c4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 14:09:23 -0700 Subject: rcu: Make use of rcu_preempt_has_tasks() Given that there is now arcu_preempt_has_tasks() function that checks to see if the ->blkd_tasks list is non-empty, this commit makes use of it. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d594da48f4b4..bcc6d9134d47 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -540,7 +540,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -706,7 +706,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rnp->exp_tasks = rnp->blkd_tasks.next; @@ -985,7 +985,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; -- cgit v1.2.3 From 3e9f5c70d85060ff0db71826e2048daffec336e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:15:17 -0800 Subject: rcu: Don't spawn rcub kthreads on root rcu_node structure Now that offlining CPUs no longer moves leaf rcu_node structures' ->blkd_tasks lists to the root, there is no way for the root rcu_node structure's ->blkd_task list to be nonempty, unless the root node is also the sole leaf node. This commit therefore refrains from creating an rcub kthread for the root rcu_node structure unless it is also the sole leaf. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bcc6d9134d47..3e64bb197b1a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1352,12 +1352,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } static void rcu_prepare_kthreads(int cpu) -- cgit v1.2.3 From 1be0085b515e786e962b9f2d03616209eb6eb9a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:20:20 -0800 Subject: rcu: Don't initiate RCU priority boosting on root rcu_node Because there is no longer any preempted tasks on the root rcu_node, and because there is no longer ever an rcub kthread for the root rcu_node, this commit drops the code in force_qs_rnp() that attempts to awaken the non-existent root rcub kthread. This is strictly a performance enhancement, removing a root rcu_node ->lock acquisition and release along with some tests in rcu_initiate_boost(), ending with the test that notes that there is no rcub kthread. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 990b406faf4e..5a0a4c969d38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2513,12 +2513,6 @@ static void force_qs_rnp(struct rcu_state *rsp, } raw_spin_unlock_irqrestore(&rnp->lock, flags); } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - } } /* -- cgit v1.2.3 From 5d0b024973027556b48a09bb36b55dc853ec7c6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Nov 2014 08:07:08 -0800 Subject: rcu: Don't bother affinitying rcub kthreads away from offline CPUs When rcu_boost_kthread_setaffinity() sees that all CPUs for a given rcu_node structure are now offline, it affinities the corresponding RCU-boost ("rcub") kthread away from those CPUs. This is pointless because the kthread cannot run on those offline CPUs in any case. This commit therefore removes this unneeded code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e64bb197b1a..1fac68220999 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1322,12 +1322,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -- cgit v1.2.3 From 3ba4d0e09bf965297e97adf195e0ea246cfe5c74 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Nov 2014 09:57:51 -0800 Subject: rcu: Note quiescent state when CPU goes offline The rcu_cleanup_dead_cpu() function (called after a CPU has gone completely offline) has not reported a quiescent state because there was probably at least one synchronize_rcu() between the time the CPU went offline and the CPU_DEAD notifier, and this would have detected the CPU's offline state via quiescent-state forcing. However, the plan is for CPUs to take themselves offline, at which point it makes sense for them to report their own quiescent state. This commit makes this change in preparation for the new CPU-hotplug setup. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5a0a4c969d38..24d3c67f7be7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2297,7 +2297,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rnp->qsmaskinit &= ~rdp->grpmask; if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) rcu_cleanup_dead_rnp(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); -- cgit v1.2.3 From abaf3f9d275b8d856ae5e47531e40c0bfeac012b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Nov 2014 16:30:01 +0800 Subject: rcu: Revert "Allow post-unlock reference for rt_mutex" to avoid priority-inversion The patch dfeb9765ce3c ("Allow post-unlock reference for rt_mutex") ensured rcu-boost safe even the rt_mutex has post-unlock reference. But rt_mutex allowing post-unlock reference is definitely a bug and it was fixed by the commit 27e35715df54 ("rtmutex: Plug slow unlock race"). This fix made the previous patch (dfeb9765ce3c) useless. And even worse, the priority-inversion introduced by the the previous patch still exists. rcu_read_unlock_special() { rt_mutex_unlock(&rnp->boost_mtx); /* Priority-Inversion: * the current task had been deboosted and preempted as a low * priority task immediately, it could wait long before reschedule in, * and the rcu-booster also waits on this low priority task and sleeps. * This priority-inversion makes rcu-booster can't work * as expected. */ complete(&rnp->boost_completion); } Just revert the patch to avoid it. Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 5 ----- kernel/rcu/tree_plugin.h | 8 +------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 883ebc8e2b6e..95356477d560 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,11 +172,6 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ - struct completion boost_completion; - /* Used to ensure that the rt_mutex used */ - /* to carry out the boosting is fully */ - /* released with no future boostee accesses */ - /* before that rt_mutex is re-initialized. */ struct rt_mutex boost_mtx; /* Used only for the priority-boosting */ /* side effect, not as a lock. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1fac68220999..625e26040e6b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -429,10 +429,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1081,15 +1079,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } -- cgit v1.2.3 From 6cd534ef8bfe422a5a847b953d1039841509c374 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Dec 2014 07:45:13 -0800 Subject: rcu: Don't scan root rcu_node structure for stalled tasks Now that blocked tasks are no longer migrated to the root rcu_node structure, there is no need to scan the root rcu_node structure for blocked tasks stalling the current grace period. This commit therefore removes this scan. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 24d3c67f7be7..4ed2c2842103 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1107,15 +1107,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; -- cgit v1.2.3 From ab954c167ed9ac107a8beb1d6e1745ae698a3421 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 18:25:59 -0800 Subject: rcu: Remove redundant callback-list initialization The RCU callback lists are initialized in both rcu_boot_init_percpu_data() and rcu_init_percpu_data(). The former is intended for initializing immutable data, so this commit removes the initialization from rcu_boot_init_percpu_data() and leaves it in rcu_init_percpu_data(). This change prepares for permitting callbacks to be queued very early in boot. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4ed2c2842103..febd0f72a3c9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3419,9 +3419,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - init_callback_list(rdp); - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -- cgit v1.2.3 From a5c198f4f7da6cc48116ca239c59c9f44b753364 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 23 Nov 2014 20:30:06 -0800 Subject: rcu: Expand SRCU ->completed to 64 bits When rcutorture used only the low-order 32 bits of the grace-period number, it was not a problem for SRCU to use a 32-bit completed field. However, rcutorture now uses the full 64 bits on 64-bit systems, so this commit converts SRCU's ->completed field to unsigned long so as to provide 64 bits on 64-bit systems. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 4 ++-- kernel/rcu/srcu.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a2783cb5d275..ef923dd96249 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -45,7 +45,7 @@ struct rcu_batch { #define RCU_BATCH_INIT(name) { NULL, &(name.head) } struct srcu_struct { - unsigned completed; + unsigned long completed; struct srcu_struct_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; @@ -135,7 +135,7 @@ int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); void synchronize_srcu_expedited(struct srcu_struct *sp); -long srcu_batches_completed(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3eb2f7b..445bf8ffe3fb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } -- cgit v1.2.3 From 83fe27ea531161a655f02dc7732d14cfaa27fd5d Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Fri, 5 Dec 2014 11:24:45 -0500 Subject: rcu: Make SRCU optional by using CONFIG_SRCU SRCU is not necessary to be compiled by default in all cases. For tinification efforts not compiling SRCU unless necessary is desirable. The current patch tries to make compiling SRCU optional by introducing a new Kconfig option CONFIG_SRCU which is selected when any of the components making use of SRCU are selected. If we do not select CONFIG_SRCU, srcu.o will not be compiled at all. text data bss dec hex filename 2007 0 0 2007 7d7 kernel/rcu/srcu.o Size of arch/powerpc/boot/zImage changes from text data bss dec hex filename 831552 64180 23944 919676 e087c arch/powerpc/boot/zImage : before 829504 64180 23952 917636 e0084 arch/powerpc/boot/zImage : after so the savings are about ~2000 bytes. Signed-off-by: Pranith Kumar CC: Paul E. McKenney CC: Josh Triplett CC: Lai Jiangshan Signed-off-by: Paul E. McKenney [ paulmck: resolve conflict due to removal of arch/ia64/kvm/Kconfig. ] --- arch/arm/kvm/Kconfig | 1 + arch/arm64/kvm/Kconfig | 1 + arch/mips/kvm/Kconfig | 1 + arch/powerpc/kvm/Kconfig | 1 + arch/s390/kvm/Kconfig | 1 + arch/tile/kvm/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/kvm/Kconfig | 1 + drivers/clk/Kconfig | 1 + drivers/cpufreq/Kconfig | 1 + drivers/devfreq/Kconfig | 1 + drivers/md/Kconfig | 1 + drivers/net/Kconfig | 1 + fs/btrfs/Kconfig | 1 + fs/notify/Kconfig | 1 + fs/quota/Kconfig | 1 + init/Kconfig | 9 +++++++++ kernel/notifier.c | 3 +++ kernel/power/Kconfig | 1 + kernel/rcu/Makefile | 3 ++- lib/Kconfig.debug | 1 + mm/Kconfig | 1 + security/tomoyo/Kconfig | 1 + 23 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd299b1a8..3afee5f40f4f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_ARM_HOST + select SRCU depends on ARM_VIRT_EXT && ARM_LPAE ---help--- Support hosting virtualized guest machines. You will also diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ba85e9ea388..b334084d3675 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select KVM_ARM_HOST select KVM_ARM_VGIC select KVM_ARM_TIMER + select SRCU ---help--- Support hosting virtualized guest machines. diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 30e334e823bd..2ae12825529f 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO + select SRCU ---help--- Support for hosting Guest kernels. Currently supported on MIPS32 processors. diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index f5769f19ae25..11850f310fb4 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_EVENTFD + select SRCU config KVM_BOOK3S_HANDLER bool diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 646db9c467d1..5fce52cf0e57 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING + select SRCU ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig index 2298cb1daff7..1e968f7550dc 100644 --- a/arch/tile/kvm/Kconfig +++ b/arch/tile/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM depends on HAVE_KVM && MODULES select PREEMPT_NOTIFIERS select ANON_INODES + select SRCU ---help--- Support hosting paravirtualized guest machines. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..661269953c1a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -138,6 +138,7 @@ config X86 select HAVE_ACPI_APEI_NMI if ACPI select ACPI_LEGACY_TABLES_LOOKUP if ACPI select X86_FEATURE_NAMES if PROC_FS + select SRCU config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f9d16ff56c6b..7dc7ba577ecd 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -40,6 +40,7 @@ config KVM select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_VFIO + select SRCU ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index 3f44f292d066..91f86131bb7a 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -13,6 +13,7 @@ config COMMON_CLK bool select HAVE_CLK_PREPARE select CLKDEV_LOOKUP + select SRCU ---help--- The common clock framework is a single definition of struct clk, useful across many platforms, as well as an diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 29b2ef5a68b9..a171fef2c2b6 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -2,6 +2,7 @@ menu "CPU Frequency scaling" config CPU_FREQ bool "CPU Frequency scaling" + select SRCU help CPU Frequency scaling allows you to change the clock speed of CPUs on the fly. This is a nice method to save power, because diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index faf4e70c42e0..3891f6781298 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -1,5 +1,6 @@ menuconfig PM_DEVFREQ bool "Generic Dynamic Voltage and Frequency Scaling (DVFS) support" + select SRCU help A device may have a list of frequencies and voltages available. devfreq, a generic DVFS framework can be registered for a device diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 5bdedf6df153..c355a226a024 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -5,6 +5,7 @@ menuconfig MD bool "Multiple devices driver support (RAID and LVM)" depends on BLOCK + select SRCU help Support multiple physical spindles through a single logical device. Required for RAID and logical volume management. diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index d6607ee9c855..84673ebcf428 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -197,6 +197,7 @@ config NETCONSOLE_DYNAMIC config NETPOLL def_bool NETCONSOLE + select SRCU config NET_POLL_CONTROLLER def_bool NETPOLL diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a66768ebc8d1..80e9c18ea64f 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -8,6 +8,7 @@ config BTRFS_FS select LZO_DECOMPRESS select RAID6_PQ select XOR_BLOCKS + select SRCU help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 22c629eedd82..2a24249b30af 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,5 +1,6 @@ config FSNOTIFY def_bool n + select SRCU source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index c51df1dd237e..4a09975aac90 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -5,6 +5,7 @@ config QUOTA bool "Quota support" select QUOTACTL + select SRCU help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the diff --git a/init/Kconfig b/init/Kconfig index 9afb971497f4..f085969ba340 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -501,9 +501,17 @@ config TINY_RCU endchoice +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + config TASKS_RCU bool "Task_based RCU implementation using voluntary context switch" default n + select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and @@ -1595,6 +1603,7 @@ config PERF_EVENTS depends on HAVE_PERF_EVENTS select ANON_INODES select IRQ_WORK + select SRCU help Enable kernel support for various performance events provided by software and hardware. diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6eab62..ae9fc7cc360e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +#ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); +#endif /* CONFIG_SRCU */ + static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 48b28d387c7f..7e01f78f0417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -251,6 +251,7 @@ config APM_EMULATION config PM_OPP bool + select SRCU ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index e6fae503d1bc..50a808424b06 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,5 @@ -obj-y += update.o srcu.o +obj-y += update.o +obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5f2ce616c046..7a9c93e327f2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1215,6 +1215,7 @@ config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST + select SRCU default n help This option provides a kernel module that runs torture tests diff --git a/mm/Kconfig b/mm/Kconfig index 1d1ae6b078fd..4395b12869c8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -325,6 +325,7 @@ config VIRT_TO_BUS config MMU_NOTIFIER bool + select SRCU config KSM bool "Enable KSM for page merging" diff --git a/security/tomoyo/Kconfig b/security/tomoyo/Kconfig index 8eb779b9d77f..604e718d68d3 100644 --- a/security/tomoyo/Kconfig +++ b/security/tomoyo/Kconfig @@ -5,6 +5,7 @@ config SECURITY_TOMOYO select SECURITYFS select SECURITY_PATH select SECURITY_NETWORK + select SRCU default n help This selects TOMOYO Linux, pathname-based access control. -- cgit v1.2.3 From fc908ed33e7c1428f799abb12399f906da03b397 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 09:57:48 -0800 Subject: rcu: Make RCU_CPU_STALL_INFO include number of fqs attempts One way that an RCU CPU stall warning can happen is if the grace-period kthread is not allowed to execute. One proxy for this kthread's forward progress is the number of force-quiescent-state (fqs) scans. This commit therefore adds the number of fqs scans to the RCU CPU stall warning printouts when CONFIG_RCU_CPU_STALL_INFO=y. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..654b15be1e36 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1043,6 +1043,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) j1 = rcu_jiffies_till_stall_check(); ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; + rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..e300848cc0cf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -492,6 +492,8 @@ struct rcu_state { /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ + unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ + /* GP start. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..769384d77437 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1898,11 +1898,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } -- cgit v1.2.3 From 6ccd2ecd422644277b7d8b37222e3af3f43ea9ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Dec 2014 10:20:59 -0800 Subject: rcu: Improve diagnostics for spurious RCU CPU stall warnings The current RCU CPU stall warning code will print "Stall ended before state dump start" any time that the stall-warning code is triggered on a CPU that has already reported a quiescent state for the current grace period and if all quiescent states have been reported for the current grace period. However, a true stall can result in these symptoms, for example, by preventing RCU's grace-period kthreads from ever running This commit therefore checks for this condition, reporting the end of the stall only if one of the grace-period counters has actually advanced. Otherwise, it reports the last time that the grace-period kthread made meaningful progress. (In normal situations, the grace-period kthread should make meaningful progress at least every jiffies_till_next_fqs jiffies.) Reported-by: Miroslav Benes Signed-off-by: Paul E. McKenney Tested-by: Miroslav Benes --- Documentation/RCU/stallwarn.txt | 5 +++++ kernel/rcu/tree.c | 32 +++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 ++ 3 files changed, 34 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index ed186a902d31..55f9707fe60a 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -187,6 +187,11 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the behavior, you might need to replace some of the cond_resched() calls with calls to cond_resched_rcu_qs(). +o Anything that prevents RCU's grace-period kthreads from running. + This can result in the "All QSes seen" console-log message. + This message will include information on when the kthread last + ran and how often it should be expected to run. + o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 654b15be1e36..a2ceb66bcd67 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1066,11 +1066,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } -static void print_other_cpu_stall(struct rcu_state *rsp) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; long delta; unsigned long flags; + unsigned long gpa; + unsigned long j; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1123,10 +1125,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else + if (ndetected) { rcu_dump_cpu_stacks(rsp); + } else { + if (ACCESS_ONCE(rsp->gpnum) != gpnum || + ACCESS_ONCE(rsp->completed) == gpnum) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + rsp->name, j - gpa, j, gpa, + jiffies_till_next_fqs); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } /* Complain about tasks blocking the grace period. */ @@ -1226,7 +1240,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); + print_other_cpu_stall(rsp, gpnum); } } @@ -1622,6 +1636,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); @@ -1682,6 +1697,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } mutex_unlock(&rsp->onoff_mutex); @@ -1698,6 +1714,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ @@ -1736,6 +1753,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1772,6 +1790,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1821,6 +1840,7 @@ static int __noreturn rcu_gp_kthread(void *arg) if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1864,9 +1884,11 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e300848cc0cf..5ec81cf938fd 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -488,6 +488,8 @@ struct rcu_state { /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_activity; /* Time of last GP kthread */ + /* activity in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ -- cgit v1.2.3 From e3663b1024d1f94688e5233440ad67a9bc10b94e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 20:26:55 -0800 Subject: rcu: Handle gpnum/completed wrap while dyntick idle Subtle race conditions can result if a CPU stays in dyntick-idle mode long enough for the ->gpnum and ->completed fields to wrap. For example, consider the following sequence of events: o CPU 1 encounters a quiescent state while waiting for grace period 5 to complete, but then enters dyntick-idle mode. o While CPU 1 is in dyntick-idle mode, the grace-period counters wrap around so that the grace period number is now 4. o Just as CPU 1 exits dyntick-idle mode, grace period 4 completes and grace period 5 begins. o The quiescent state that CPU 1 passed through during the old grace period 5 looks like it applies to the new grace period 5. Therefore, the new grace period 5 completes without CPU 1 having passed through a quiescent state. This could clearly be a fatal surprise to any long-running RCU read-side critical section that happened to be running on CPU 1 at the time. At one time, this was not a problem, given that it takes significant time for the grace-period counters to overflow even on 32-bit systems. However, with the advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long idle periods are now becoming quite feasible. It is therefore time to close this race. This commit therefore avoids this race condition by having the quiescent-state forcing code detect when a CPU is falling too far behind, and setting a new rcu_data field ->gpwrap when this happens. Whenever this new ->gpwrap field is set, the CPU's ->gpnum and ->completed fields are known to be untrustworthy, and can be ignored, along with any associated quiescent states. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++++++----- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 3 ++- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2ceb66bcd67..5987fdc85fc4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -930,6 +930,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { + if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + rdp->mynode->gpnum)) + ACCESS_ONCE(rdp->gpwrap) = true; return 0; } } @@ -1577,7 +1580,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { + if (rdp->completed == rnp->completed && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. */ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1592,7 +1596,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum) { + if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1603,6 +1607,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->passed_quiesce = 0; rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); + ACCESS_ONCE(rdp->gpwrap) = false; } return ret; } @@ -1616,7 +1621,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + rdp->completed == ACCESS_ONCE(rnp->completed) && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; @@ -2066,7 +2072,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -3190,7 +3196,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5ec81cf938fd..7472ff388d55 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -260,6 +260,7 @@ struct rcu_data { bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 769384d77437..81ff8b9a5a39 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1605,7 +1605,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); -- cgit v1.2.3 From 83ac237a950e130c974d0170cce30891dcd8f250 Mon Sep 17 00:00:00 2001 From: Christoph Jaeger Date: Mon, 5 Jan 2015 22:34:09 -0500 Subject: livepatch: kconfig: use bool instead of boolean Keyword 'boolean' for type definition attributes is considered deprecated and should not be used anymore. No functional changes. Reference: http://lkml.kernel.org/r/cover.1418003065.git.cj@linux.com Reference: http://lkml.kernel.org/r/1419108071-11607-1-git-send-email-cj@linux.com Signed-off-by: Christoph Jaeger Reviewed-by: Petr Mladek Reviewed-by: Jingoo Han Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 96da00fbc120..66797aa597c0 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,10 +1,10 @@ config ARCH_HAVE_LIVE_PATCHING - boolean + bool help Arch supports kernel live patching config LIVE_PATCHING - boolean "Kernel Live Patching" + bool "Kernel Live Patching" depends on DYNAMIC_FTRACE_WITH_REGS depends on MODULES depends on SYSFS -- cgit v1.2.3 From 6ada1fc0e1c4775de0e043e1bd3ae9d065491aa5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 3 Dec 2014 19:22:48 -0500 Subject: time: settimeofday: Validate the values of tv from user An unvalidated user input is multiplied by a constant, which can result in an undefined behaviour for large values. While this is validated later, we should avoid triggering undefined behaviour. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: stable Signed-off-by: Sasha Levin [jstultz: include trivial milisecond->microsecond correction noticed by Andy] Signed-off-by: John Stultz --- include/linux/time.h | 13 +++++++++++++ kernel/time/time.c | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 8c42cf8d2444..5989b0ead1ec 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -99,6 +99,19 @@ static inline bool timespec_valid_strict(const struct timespec *ts) return true; } +static inline bool timeval_valid(const struct timeval *tv) +{ + /* Dates before 1970 are bogus */ + if (tv->tv_sec < 0) + return false; + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + + return true; +} + extern struct timespec timespec_trunc(struct timespec t, unsigned gran); #define CURRENT_TIME (current_kernel_time()) diff --git a/kernel/time/time.c b/kernel/time/time.c index a9ae20fb0b11..22d5d3b73970 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } -- cgit v1.2.3 From 5e5aeb4367b450a28f447f6d5ab57d8f2ab16a5f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 3 Dec 2014 19:25:05 -0500 Subject: time: adjtimex: Validate the ADJ_FREQUENCY values Verify that the frequency value from userspace is valid and makes sense. Unverified values can cause overflows later on. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: stable Signed-off-by: Sasha Levin [jstultz: Fix up bug for negative values and drop redunent cap check] Signed-off-by: John Stultz --- kernel/time/ntp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..28bf91c60a0b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc) if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) return -EPERM; + if (txc->modes & ADJ_FREQUENCY) { + if (LONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From b9dfe0bed999d23ee8838d389637dd8aef83fafa Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 9 Jan 2015 10:53:21 +0100 Subject: livepatch: handle ancient compilers with more grace We are aborting a build in case when gcc doesn't support fentry on x86_64 (regs->ip modification can't really reliably work with mcount). This however breaks allmodconfig for people with older gccs that don't support -mfentry. Turn the build-time failure into runtime failure, resulting in the whole infrastructure not being initialized if CC_USING_FENTRY is unset. Reported-by: Andrew Morton Signed-off-by: Jiri Kosina Signed-off-by: Andrew Morton Acked-by: Josh Poimboeuf --- arch/x86/include/asm/livepatch.h | 6 +++++- kernel/livepatch/core.c | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index b5608d7757fd..26e58134c8cb 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -25,9 +25,13 @@ #include #ifdef CONFIG_LIVE_PATCHING +static inline int klp_check_compiler_support(void) +{ #ifndef CC_USING_FENTRY -#error Your compiler must support -mfentry for live patching to work + return 1; #endif + return 0; +} extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 6f6387912da7..ce42d3b930dc 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -911,6 +911,12 @@ static int klp_init(void) { int ret; + ret = klp_check_compiler_support(); + if (ret) { + pr_info("Your compiler is too old; turning off.\n"); + return -EINVAL; + } + ret = register_module_notifier(&klp_module_nb); if (ret) return ret; -- cgit v1.2.3 From 99590ba565a22c9f58f7528a94881d0455eef018 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 9 Jan 2015 14:03:04 -0600 Subject: livepatch: fix deferred module patching order When applying multiple patches to a module, if the module is loaded after the patches are loaded, the patches are applied in reverse order: $ insmod patch1.ko [ 43.172992] livepatch: enabling patch 'patch1' $ insmod patch2.ko [ 46.571563] livepatch: enabling patch 'patch2' $ modprobe nfsd [ 52.888922] livepatch: applying patch 'patch2' to loading module 'nfsd' [ 52.899847] livepatch: applying patch 'patch1' to loading module 'nfsd' Fix the loading order by storing the klp_patches list in queue order. Signed-off-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ce42d3b930dc..3d9c00b5223a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -739,7 +739,7 @@ static int klp_init_patch(struct klp_patch *patch) goto free; } - list_add(&patch->list, &klp_patches); + list_add_tail(&patch->list, &klp_patches); mutex_unlock(&klp_mutex); -- cgit v1.2.3 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 4 ++-- include/linux/rcutree.h | 6 +++--- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..91f7e4c37800 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -94,7 +94,7 @@ static inline void rcu_virt_note_context_switch(int cpu) /* * Return the number of grace periods. */ -static inline long rcu_batches_completed(void) +static inline unsigned long rcu_batches_completed(void) { return 0; } @@ -102,7 +102,7 @@ static inline long rcu_batches_completed(void) /* * Return the number of bottom-half grace periods. */ -static inline long rcu_batches_completed_bh(void) +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..9885bfb6b123 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,9 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -long rcu_batches_completed(void); -long rcu_batches_completed_bh(void); -long rcu_batches_completed_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..e26d78712e16 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -317,7 +317,7 @@ static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* * Return the number of RCU BH batches processed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..1a07d7379ac6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); +unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..f69300d4a51f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v1.2.3 From 6b80da42c02bc731ab38b6a37da5366abe26717f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 14:19:26 -0800 Subject: rcutorture: Use unsigned for Reader Batch computations The counter returned by the various ->completed functions is subject to overflow, which means that subtracting two such counters might result in overflow, which invokes undefined behavior in the C standard. This commit therefore changes these functions and variables to unsigned to avoid this undefined behavior. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4d559baf06e0..f43e3517f5f5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,7 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - int (*completed)(void); + unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -296,7 +296,7 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static int rcu_torture_completed(void) +static unsigned long rcu_torture_completed(void) { return rcu_batches_completed(); } @@ -356,7 +356,7 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } -static int rcu_no_completed(void) +static unsigned long rcu_no_completed(void) { return 0; } @@ -407,7 +407,7 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static int rcu_bh_torture_completed(void) +static unsigned long rcu_bh_torture_completed(void) { return rcu_batches_completed_bh(); } @@ -510,7 +510,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) srcu_read_unlock(&srcu_ctl, idx); } -static int srcu_torture_completed(void) +static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(&srcu_ctl); } @@ -1015,8 +1015,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; - int completed; - int completed_end; + unsigned long completed; + unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1073,8 +1073,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { - int completed; - int completed_end; + unsigned long completed; + unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; -- cgit v1.2.3 From 1e32eaee4c370afca20324f634bf47ae991cd240 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 16:04:34 -0800 Subject: rcutorture: Drop rcu_torture_completed() and friends Now that the return type of rcu_batches_completed() and friends matches that of the rcu_torture_ops structure's ->completed field, the wrapper functions can be deleted. This commit carries out that deletion, while also wiring "sched"'s ->completed field to rcu_batches_completed_sched(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f43e3517f5f5..aadbc072ccf4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -296,11 +296,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static unsigned long rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -377,7 +372,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, + .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -407,11 +402,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static unsigned long rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - static void rcu_bh_torture_deferred_free(struct rcu_torture *p) { call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); @@ -423,7 +413,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, + .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -600,7 +590,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, + .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, -- cgit v1.2.3 From f9103c390257d06c162d9e3c2a90d2bdedadfe17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 16:48:20 -0800 Subject: rcu: Remove redundant rcu_batches_completed() declaration Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 1a07d7379ac6..69eb422445f9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 917963d0b30f9c4153c372c165178501d97b6b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 17:10:16 -0800 Subject: rcutorture: Check from beginning to end of grace period Currently, rcutorture's Reader Batch checks measure from the end of the previous grace period to the end of the current one. This commit tightens up these checks by measuring from the start and end of the same grace period. This involves adding rcu_batches_started() and friends corresponding to the existing rcu_batches_completed() and friends. We leave SRCU alone for the moment, as it does not yet have a way of tracking both ends of its grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 30 +++++++++++++++++++++++++++--- include/linux/rcutree.h | 3 +++ kernel/rcu/rcutorture.c | 37 +++++++++++++++++++++++++++---------- kernel/rcu/tree.c | 40 ++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree_plugin.h | 28 ---------------------------- 5 files changed, 95 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1ce2d6b8f0c3..984192160e9b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,7 +92,31 @@ static inline void rcu_virt_note_context_switch(int cpu) } /* - * Return the number of grace periods. + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. */ static inline unsigned long rcu_batches_completed(void) { @@ -100,7 +124,7 @@ static inline unsigned long rcu_batches_completed(void) } /* - * Return the number of bottom-half grace periods. + * Return the number of bottom-half grace periods completed. */ static inline unsigned long rcu_batches_completed_bh(void) { @@ -108,7 +132,7 @@ static inline unsigned long rcu_batches_completed_bh(void) } /* - * Return the number of sched grace periods. + * Return the number of sched grace periods completed. */ static inline unsigned long rcu_batches_completed_sched(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9885bfb6b123..c0dd124e69ec 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,6 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aadbc072ccf4..24142c200901 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,6 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); + unsigned long (*started)(void); unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); @@ -372,6 +373,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, + .started = rcu_batches_started, .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, @@ -413,6 +415,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, + .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, @@ -456,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -554,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -590,6 +595,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, + .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, @@ -628,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, @@ -1005,8 +1012,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; + unsigned long started; unsigned long completed; - unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1014,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1037,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1063,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { + unsigned long started; unsigned long completed; - unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -1083,7 +1095,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1104,14 +1119,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e26d78712e16..c0faad51ae87 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -315,7 +315,43 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); + +/* + * Return the number of RCU BH batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_sched(void) { @@ -324,7 +360,7 @@ unsigned long rcu_batches_completed_sched(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_bh(void) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f69300d4a51f..07e61a04de1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -113,25 +113,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static unsigned long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -932,15 +913,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From 7602de4af192b1527767538454557cb7564bae60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 18:39:54 -0800 Subject: rcutorture: Add more diagnostics in rcu_barrier() test failure case Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 24142c200901..30d42aa55d83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1427,6 +1427,9 @@ static int rcu_torture_barrier(void *arg) cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; + pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", + atomic_read(&barrier_cbs_invoked), + n_barrier_cbs); WARN_ON_ONCE(1); } n_barrier_successes++; -- cgit v1.2.3 From cbf6ab52add20b845f903decc973afbd5463c527 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Jan 2015 19:29:32 +0800 Subject: kprobes: Pass the original kprobe for preparing optimized kprobe Pass the original kprobe for preparing an optimized kprobe arch-dep part, since for some architecture (e.g. ARM32) requires the information in original kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Wang Nan Signed-off-by: Jon Medhurst --- arch/x86/kernel/kprobes/opt.c | 3 ++- include/linux/kprobes.h | 3 ++- kernel/kprobes.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bbf3dc8..0dd8d089c315 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused. */ -int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) { u8 *buf; int ret; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5297f9fa0ef2..1ab54754a86d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -308,7 +308,8 @@ struct optimized_kprobe { /* Architecture dependent functions for direct jump optimization */ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn); extern int arch_check_optimized_kprobe(struct optimized_kprobe *op); -extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op); +extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *orig); extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op); extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..bad4e959f2f7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); return &op->kp; } -- cgit v1.2.3 From 638476007d13534b2ed4134bf0279ef44071140b Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 16 Dec 2014 23:58:29 +0800 Subject: sched/fair: Fix the dealing with decay_count in __synchronize_entity_decay() In __synchronize_entity_decay(), if "decays" happens to be zero, se->avg.decay_count will not be zeroed, holding the positive value assigned when dequeued last time. This is problematic in the following case: If this runnable task is CFS-balanced to other CPUs soon afterwards, migrate_task_rq_fair() will treat it as a blocked task due to its non-zero decay_count, thereby adding its load to cfs_rq->removed_load wrongly. Thus, we must zero se->avg.decay_count in this case as well. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418745509-2609-1-git-send-email-pang.xunlei@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40667cbf371b..97000a99a293 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2574,11 +2574,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; + se->avg.decay_count = 0; if (!decays) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.decay_count = 0; return decays; } -- cgit v1.2.3 From a8b686b3af4419f92e0ea5be1c76fb68363df8e6 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 16 Dec 2014 16:25:28 -0600 Subject: sched/debug: Check for stack overflow in ___might_sleep() Sometimes a "BUG: sleeping function called from invalid context" message is not indicative of locking problems, but is the result of a stack overflow corrupting the thread info. Witness http://oss.sgi.com/archives/xfs/2014-02/msg00325.html for example, which took a few go-rounds to sort out. If we're printing the warning, things are wonky already, and it'd be informative to check for the stack end corruption at this point, too. Signed-off-by: Eric Sandeen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/5490B158.4060005@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..56c9b79772bd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7325,6 +7325,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) in_atomic(), irqs_disabled(), current->pid, current->comm); + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -- cgit v1.2.3 From 1f8a7633094b7886c0677b78ba60b82e501f3ce6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 5 Dec 2014 21:22:22 +0900 Subject: sched/debug: Fix potential call to __ffs(0) in sched_show_task() "struct task_struct"->state is "volatile long" and __ffs() warns that "Undefined if no bit exists, so code should check against 0 first." Therefore, at expression state = p->state ? __ffs(p->state) + 1 : 0; in sched_show_task(), CPU might see "p->state" before "?" as "non-zero" but "p->state" after "?" as "zero", which could result in "state >= sizeof(stat_nam)" being true and bogus '?' is printed. This patch changes "state" from "unsigned int" to "unsigned long" and save "p->state" before calling __ffs(), in order to avoid potential call to __ffs(0). Signed-off-by: Tetsuo Handa Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/201412052131.GCE35924.FVHFOtLOJOMQFS@I-love.SAKURA.ne.jp Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 56c9b79772bd..816c17203c16 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4508,9 +4508,10 @@ void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned state; + unsigned long state = p->state; - state = p->state ? __ffs(p->state) + 1 : 0; + if (state) + state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 -- cgit v1.2.3 From bb04159df99fa353d0fb524574aca03ce2c6515b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 15 Dec 2014 14:56:58 +0300 Subject: sched/fair: Fix sched_entity::avg::decay_count initialization Child has the same decay_count as parent. If it's not zero, we add it to parent's cfs_rq->removed_load: wake_up_new_task()->set_task_cpu()->migrate_task_rq_fair(). Child's load is a just garbade after copying of parent, it hasn't been on cfs_rq yet, and it must not be added to cfs_rq::removed_load in migrate_task_rq_fair(). The patch moves sched_entity::avg::decay_count intialization in sched_fork(). So, migrate_task_rq_fair() does not change removed_load. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418644618.6074.13.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 816c17203c16..95ac795ab3d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1832,6 +1832,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SMP + p->se.avg.decay_count = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97000a99a293..2a0b302e51de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p) { u32 slice; - p->se.avg.decay_count = 0; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; p->se.avg.runnable_avg_sum = slice; p->se.avg.runnable_avg_period = slice; -- cgit v1.2.3 From 1b537c7d1e58c761212a193085f9049b58f672e6 Mon Sep 17 00:00:00 2001 From: Yao Dongdong Date: Mon, 29 Dec 2014 14:41:43 +0800 Subject: sched/core: Remove check of p->sched_class Search all usage of p->sched_class in sched/core.c, no one check it before use, so it seems that every task must belong to one sched_class. Signed-off-by: Yao Dongdong [ Moved the early class assignment to make it boot. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1419835303-28958-1-git-send-email-yaodongdong@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95ac795ab3d3..46a2345f9f45 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4744,7 +4744,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); @@ -7253,6 +7253,11 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -7263,11 +7268,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ -- cgit v1.2.3 From cebde6d681aa45f96111cfcffc1544cf2a0454ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Jan 2015 11:18:10 +0100 Subject: sched/core: Validate rq_clock*() serialization rq->clock{,_task} are serialized by rq->lock, verify this. One immediate fail is the usage in scale_rt_capability, so 'annotate' that for now, there's more 'funny' there. Maybe change rq->lock into a raw_seqlock_t? (Only 32-bit is affected) Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20150105103554.361872747@infradead.org Cc: Linus Torvalds Cc: umgwanakikbuti@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2a0b302e51de..50ff90289293 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5948,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu) */ age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); + delta = __rq_clock_broken(rq) - age_stamp; - delta = rq_clock(rq) - age_stamp; if (unlikely(delta < 0)) delta = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a2a45c970e7..bd2373273a9e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -687,13 +687,20 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return ACCESS_ONCE(rq->clock); +} + static inline u64 rq_clock(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock_task; } -- cgit v1.2.3 From 9edfbfed3f544a7830d99b341f0c175995a02950 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Jan 2015 11:18:11 +0100 Subject: sched/core: Rework rq->clock update skips The original purpose of rq::skip_clock_update was to avoid 'costly' clock updates for back to back wakeup-preempt pairs. The big problem with it has always been that the rq variable is unaware of the context and causes indiscrimiate clock skips. Rework the entire thing and create a sense of context by only allowing schedule() to skip clock updates. (XXX can we measure the cost of the added store?) By ensuring only schedule can ever skip an update, we guarantee we're never more than 1 tick behind on the update. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150105103554.432381549@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++++---- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 9 ++++++--- kernel/sched/sched.h | 15 +++++++++++++-- 4 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46a2345f9f45..b53cc859fc4f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update > 0) + lockdep_assert_held(&rq->lock); + + if (rq->clock_skip_update & RQCF_ACT_SKIP) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; @@ -1046,7 +1048,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP @@ -2779,6 +2781,8 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -2803,13 +2807,13 @@ need_resched: switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev)) update_rq_clock(rq); next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->skip_clock_update = 0; + rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50ff90289293..2ecf779829f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5156,7 +5156,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); } set_skip_buddy(se); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ee15f5a0d1c1..6725e3c49660 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) enqueue = 1; /* - * Force a clock update if the CPU was idle, - * lest wakeup -> unthrottle time accumulate. + * When we're idle and a woken (rt) task is + * throttled check_preempt_curr() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq->skip_clock_update = -1; + rq_clock_skip_update(rq, false); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bd2373273a9e..0870db23d79c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -558,8 +558,6 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif - int skip_clock_update; - /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -588,6 +586,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; + unsigned int clock_skip_update; u64 clock; u64 clock_task; @@ -704,6 +703,18 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 + +static inline void rq_clock_skip_update(struct rq *rq, bool skip) +{ + lockdep_assert_held(&rq->lock); + if (skip) + rq->clock_skip_update |= RQCF_REQ_SKIP; + else + rq->clock_skip_update &= ~RQCF_REQ_SKIP; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, -- cgit v1.2.3 From 5a5375977b721503e4d6b37ab8982902cd2d10b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Jan 2015 11:18:12 +0100 Subject: sched/debug: Print rq->clock_task We seem to have forgotten adding it to the debug output like forever... do so now. Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20150105103554.495253233@infradead.org Cc: umgwanakikbuti@gmail.com Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 92cc52001e74..8baaf858d25c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -305,6 +305,7 @@ do { \ PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); + PN(clock_task); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); -- cgit v1.2.3 From 63dc47e956b464e0ed3282f6e70974eebf850180 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:04 -0800 Subject: locking/mutex: Checking the stamp is WW only Mark it so by renaming __mutex_lock_check_stamp(). Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 454195194d4a..b042ea57bbea 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -469,7 +469,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); @@ -557,7 +557,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __mutex_lock_check_stamp(lock, ww_ctx); + ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; } -- cgit v1.2.3 From e42f678a0237f84f0004fbaf0fad0b844751eadd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:05 -0800 Subject: locking/mutex: Move MCS related comments to proper location It serves much better if the comments are right before the osq_lock() call. Also delete a useless comment. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b042ea57bbea..6db3d0dea6da 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -193,17 +193,6 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * In order to avoid a stampede of mutex spinners from acquiring the mutex - * more or less simultaneously, the spinners need to acquire a MCS lock - * first before spinning on the owner field. - * - */ - -/* - * Mutex spinning code migrated from kernel/sched/core.c - */ - static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { if (lock->owner != owner) @@ -307,6 +296,11 @@ static bool mutex_optimistic_spin(struct mutex *lock, if (!mutex_can_spin_on_owner(lock)) goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ if (!osq_lock(&lock->osq)) goto done; -- cgit v1.2.3 From 4bd19084faa61a8c68586e74f03f5776179f65c2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:06 -0800 Subject: locking/mutex: Introduce ww_mutex_set_context_slowpath() ... which is equivalent to the fastpath counter part. This mainly allows getting some WW specific code out of generic mutex paths. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 6db3d0dea6da..c67a60b61625 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, } /* - * after acquiring lock with fastpath or when we lost out in contested + * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. * * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, @@ -191,6 +191,30 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, spin_unlock_mutex(&lock->base.wait_lock, flags); } +/* + * After acquiring lock in the slowpath set ctx and wake up any + * waiters so they can recheck. + * + * Callers must hold the mutex wait_lock. + */ +static __always_inline void +ww_mutex_set_context_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + lock->ctx = ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } +} #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -576,23 +600,7 @@ skip_wait: if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct mutex_waiter *cur; - - /* - * This branch gets optimized out for the common case, - * and is only important for ww_mutex_lock. - */ - ww_mutex_lock_acquired(ww, ww_ctx); - ww->ctx = ww_ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - list_for_each_entry(cur, &lock->wait_list, list) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + ww_mutex_set_context_slowpath(ww, ww_ctx); } spin_unlock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From d84b6728c54dcf73bcef3e3f7cf6767e2d224e39 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:07 -0800 Subject: locking/mcs: Better differentiate between MCS variants We have two flavors of the MCS spinlock: standard and cancelable (OSQ). While each one is independent of the other, we currently mix and match them. This patch: - Moves the OSQ code out of mcs_spinlock.h (which only deals with the traditional version) into include/linux/osq_lock.h. No unnecessary code is added to the more global header file, anything locks that make use of OSQ must include it anyway. - Renames mcs_spinlock.c to osq_lock.c. This file only contains osq code. - Introduces a CONFIG_LOCK_SPIN_ON_OWNER in order to only build osq_lock if there is support for it. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: "Paul E. McKenney" Cc: Jason Low Cc: Linus Torvalds Cc: Mikulas Patocka Cc: Waiman Long Link: http://lkml.kernel.org/r/1420573509-24774-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/osq_lock.h | 12 ++- kernel/Kconfig.locks | 4 + kernel/locking/Makefile | 3 +- kernel/locking/mcs_spinlock.c | 208 ------------------------------------------ kernel/locking/mcs_spinlock.h | 16 ---- kernel/locking/osq_lock.c | 203 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 219 insertions(+), 227 deletions(-) delete mode 100644 kernel/locking/mcs_spinlock.c create mode 100644 kernel/locking/osq_lock.c (limited to 'kernel') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index 90230d5811c5..3a6490e81b28 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -5,8 +5,11 @@ * An MCS like lock especially tailored for optimistic spinning for sleeping * lock implementations (mutex, rwsem, etc). */ - -#define OSQ_UNLOCKED_VAL (0) +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; struct optimistic_spin_queue { /* @@ -16,6 +19,8 @@ struct optimistic_spin_queue { atomic_t tail; }; +#define OSQ_UNLOCKED_VAL (0) + /* Init macro and function. */ #define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } @@ -24,4 +29,7 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock) atomic_set(&lock->tail, OSQ_UNLOCKED_VAL); } +extern bool osq_lock(struct optimistic_spin_queue *lock); +extern void osq_unlock(struct optimistic_spin_queue *lock); + #endif diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +config LOCK_SPIN_ON_OWNER + def_bool y + depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..4ca8eb151975 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c deleted file mode 100644 index 9887a905a762..000000000000 --- a/kernel/locking/mcs_spinlock.c +++ /dev/null @@ -1,208 +0,0 @@ -#include -#include -#include "mcs_spinlock.h" - -#ifdef CONFIG_SMP - -/* - * An MCS like lock especially tailored for optimistic spinning for sleeping - * lock implementations (mutex, rwsem, etc). - * - * Using a single mcs node per CPU is safe because sleeping locks should not be - * called from interrupt context and we have preemption disabled while - * spinning. - */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); - -/* - * We use the value 0 to represent "no CPU", thus the encoded value - * will be the CPU number incremented by 1. - */ -static inline int encode_cpu(int cpu_nr) -{ - return cpu_nr + 1; -} - -static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) -{ - int cpu_nr = encoded_cpu_val - 1; - - return per_cpu_ptr(&osq_node, cpu_nr); -} - -/* - * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. - * Can return NULL in case we were the last queued and we updated @lock instead. - */ -static inline struct optimistic_spin_node * -osq_wait_next(struct optimistic_spin_queue *lock, - struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) -{ - struct optimistic_spin_node *next = NULL; - int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; - - for (;;) { - if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg(&lock->tail, curr, old) == curr) { - /* - * We were the last queued, we moved @lock back. @prev - * will now observe @lock and will complete its - * unlock()/unqueue(). - */ - break; - } - - /* - * We must xchg() the @node->next value, because if we were to - * leave it in, a concurrent unlock()/unqueue() from - * @node->next might complete Step-A and think its @prev is - * still valid. - * - * If the concurrent unlock()/unqueue() wins the race, we'll - * wait for either @lock to point to us, through its Step-B, or - * wait for a new @node->next from its Step-C. - */ - if (node->next) { - next = xchg(&node->next, NULL); - if (next) - break; - } - - cpu_relax_lowlatency(); - } - - return next; -} - -bool osq_lock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_node *prev, *next; - int curr = encode_cpu(smp_processor_id()); - int old; - - node->locked = 0; - node->next = NULL; - node->cpu = curr; - - old = atomic_xchg(&lock->tail, curr); - if (old == OSQ_UNLOCKED_VAL) - return true; - - prev = decode_cpu(old); - node->prev = prev; - ACCESS_ONCE(prev->next) = node; - - /* - * Normally @prev is untouchable after the above store; because at that - * moment unlock can proceed and wipe the node element from stack. - * - * However, since our nodes are static per-cpu storage, we're - * guaranteed their existence -- this allows us to apply - * cmpxchg in an attempt to undo our queueing. - */ - - while (!smp_load_acquire(&node->locked)) { - /* - * If we need to reschedule bail... so we can block. - */ - if (need_resched()) - goto unqueue; - - cpu_relax_lowlatency(); - } - return true; - -unqueue: - /* - * Step - A -- stabilize @prev - * - * Undo our @prev->next assignment; this will make @prev's - * unlock()/unqueue() wait for a next pointer since @lock points to us - * (or later). - */ - - for (;;) { - if (prev->next == node && - cmpxchg(&prev->next, node, NULL) == node) - break; - - /* - * We can only fail the cmpxchg() racing against an unlock(), - * in which case we should observe @node->locked becomming - * true. - */ - if (smp_load_acquire(&node->locked)) - return true; - - cpu_relax_lowlatency(); - - /* - * Or we race against a concurrent unqueue()'s step-B, in which - * case its step-C will write us a new @node->prev pointer. - */ - prev = ACCESS_ONCE(node->prev); - } - - /* - * Step - B -- stabilize @next - * - * Similar to unlock(), wait for @node->next or move @lock from @node - * back to @prev. - */ - - next = osq_wait_next(lock, node, prev); - if (!next) - return false; - - /* - * Step - C -- unlink - * - * @prev is stable because its still waiting for a new @prev->next - * pointer, @next is stable because our @node->next pointer is NULL and - * it will wait in Step-A. - */ - - ACCESS_ONCE(next->prev) = prev; - ACCESS_ONCE(prev->next) = next; - - return false; -} - -void osq_unlock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node, *next; - int curr = encode_cpu(smp_processor_id()); - - /* - * Fast path for the uncontended case. - */ - if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) - return; - - /* - * Second most likely case. - */ - node = this_cpu_ptr(&osq_node); - next = xchg(&node->next, NULL); - if (next) { - ACCESS_ONCE(next->locked) = 1; - return; - } - - next = osq_wait_next(lock, node, NULL); - if (next) - ACCESS_ONCE(next->locked) = 1; -} - -#endif - diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..d1fe2ba5bac9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } -/* - * Cancellable version of the MCS lock above. - * - * Intended for adaptive spinning of sleeping locks: - * mutex_lock()/rwsem_down_{read,write}() etc. - */ - -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # value */ -}; - -extern bool osq_lock(struct optimistic_spin_queue *lock); -extern void osq_unlock(struct optimistic_spin_queue *lock); - #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c new file mode 100644 index 000000000000..ec83d4db8ec6 --- /dev/null +++ b/kernel/locking/osq_lock.c @@ -0,0 +1,203 @@ +#include +#include +#include + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); + +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + cpu_relax_lowlatency(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = decode_cpu(old); + node->prev = prev; + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + cpu_relax_lowlatency(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + cpu_relax_lowlatency(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(smp_processor_id()); + + /* + * Fast path for the uncontended case. + */ + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + return; + + /* + * Second most likely case. + */ + node = this_cpu_ptr(&osq_node); + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} -- cgit v1.2.3 From 86038c5ea81b519a8a1fcfcd5e4599aab0cdd119 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 16 Dec 2014 12:47:34 +0100 Subject: perf: Avoid horrible stack usage Both Linus (most recent) and Steve (a while ago) reported that perf related callbacks have massive stack bloat. The problem is that software events need a pt_regs in order to properly report the event location and unwind stack. And because we could not assume one was present we allocated one on stack and filled it with minimal bits required for operation. Now, pt_regs is quite large, so this is undesirable. Furthermore it turns out that most sites actually have a pt_regs pointer available, making this even more onerous, as the stack space is pointless waste. This patch addresses the problem by observing that software events have well defined nesting semantics, therefore we can use static per-cpu storage instead of on-stack. Linus made the further observation that all but the scheduler callers of perf_sw_event() have a pt_regs available, so we change the regular perf_sw_event() to require a valid pt_regs (where it used to be optional) and add perf_sw_event_sched() for the scheduler. We have a scheduler specific call instead of a more generic _noregs() like construct because we can assume non-recursion from the scheduler and thereby simplify the code further (_noregs would have to put the recursion context call inline in order to assertain which __perf_regs element to use). One last note on the implementation of perf_trace_buf_prepare(); we allow .regs = NULL for those cases where we already have a pt_regs pointer available and do not need another. Reported-by: Linus Torvalds Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Javi Merino Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Petr Mladek Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vaibhav Nagarnaik Link: http://lkml.kernel.org/r/20141216115041.GW3337@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- include/linux/perf_event.h | 28 +++++++++++++++++++++------- include/trace/ftrace.h | 7 ++++--- kernel/events/core.c | 23 +++++++++++++++++------ kernel/sched/core.c | 2 +- kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 9 files changed, 52 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c348b8..d36f68b08acc 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -595,7 +595,7 @@ extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); extern void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp); + struct pt_regs **regs, int *rctxp); static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..3a7bd80b4db8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -665,6 +665,7 @@ static inline int is_software_event(struct perf_event *event) extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs @@ -689,14 +690,25 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs) static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { - struct pt_regs hot_regs; + if (static_key_false(&perf_swevent_enabled[event_id])) + __perf_sw_event(event_id, nr, regs, addr); +} + +DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); +/* + * 'Special' version for the scheduler, it hard assumes no recursion, + * which is guaranteed by us not actually scheduling inside other swevents + * because those disable preemption. + */ +static __always_inline void +perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) +{ if (static_key_false(&perf_swevent_enabled[event_id])) { - if (!regs) { - perf_fetch_caller_regs(&hot_regs); - regs = &hot_regs; - } - __perf_sw_event(event_id, nr, regs, addr); + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); + + perf_fetch_caller_regs(regs); + ___perf_sw_event(event_id, nr, regs, addr); } } @@ -712,7 +724,7 @@ static inline void perf_event_task_sched_in(struct task_struct *prev, static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); @@ -823,6 +835,8 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh) static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void +perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { } +static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b5067345b..27609dfcce25 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -763,7 +763,7 @@ perf_trace_##call(void *__data, proto) \ struct ftrace_event_call *event_call = __data; \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_raw_##call *entry; \ - struct pt_regs __regs; \ + struct pt_regs *__regs; \ u64 __addr = 0, __count = 1; \ struct task_struct *__task = NULL; \ struct hlist_head *head; \ @@ -782,18 +782,19 @@ perf_trace_##call(void *__data, proto) \ sizeof(u64)); \ __entry_size -= sizeof(u32); \ \ - perf_fetch_caller_regs(&__regs); \ entry = perf_trace_buf_prepare(__entry_size, \ event_call->event.type, &__regs, &rctx); \ if (!entry) \ return; \ \ + perf_fetch_caller_regs(__regs); \ + \ tstruct \ \ { assign; } \ \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ - __count, &__regs, head, __task); \ + __count, __regs, head, __task); \ } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..c10124b772c4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5889,6 +5889,8 @@ end: rcu_read_unlock(); } +DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); + int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); @@ -5904,21 +5906,30 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; - int rctx; - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) + if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); +} + +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (unlikely(rctx < 0)) + goto fail; + + ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); +fail: preempt_enable_notrace(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..d22fb16a7153 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1082,7 +1082,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); } __set_task_cpu(p, new_cpu); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) } void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) + struct pt_regs **regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; + if (regs) + *regs = this_cpu_ptr(&__perf_regs[*rctxp]); raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..296079ae6583 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; @@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size -= sizeof(u32); rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, regs, &rctx); + sys_data->enter_event->event.type, NULL, &rctx); if (!rec) return; @@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size -= sizeof(u32); rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, regs, &rctx); + sys_data->exit_event->event.type, NULL, &rctx); if (!rec) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..b11441321e7a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) goto out; -- cgit v1.2.3 From 036cc30c6b6af1cd42de6c34c4461f17da01cbf7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:09 -0800 Subject: locking/osq: No need for load/acquire when acquire-polling Both mutexes and rwsems took a performance hit when we switched over from the original mcs code to the cancelable variant (osq). The reason being the use of smp_load_acquire() when polling for node->locked. This is not needed as reordering is not an issue, as such, relax the barrier semantics. Paul describes the scenario nicely: https://lkml.org/lkml/2013/11/19/405 - If we start polling before the insertion is complete, all that happens is that the first few polls have no chance of seeing a lock grant. - Ordering the polling against the initialization -- the above xchg() is already doing that for us. The smp_load_acquire() when unqueuing make sense. In addition, we don't need to worry about leaking the critical region as osq is only used internally. This impacts both regular and large levels of concurrency, ie on a 40 core system with a disk intensive workload: disk-1 804.83 ( 0.00%) 828.16 ( 2.90%) disk-61 8063.45 ( 0.00%) 18181.82 (125.48%) disk-121 7187.41 ( 0.00%) 20119.17 (179.92%) disk-181 6933.32 ( 0.00%) 20509.91 (195.82%) disk-241 6850.81 ( 0.00%) 20397.80 (197.74%) disk-301 6815.22 ( 0.00%) 20287.58 (197.68%) disk-361 7080.40 ( 0.00%) 20205.22 (185.37%) disk-421 7076.13 ( 0.00%) 19957.33 (182.04%) disk-481 7083.25 ( 0.00%) 19784.06 (179.31%) disk-541 7038.39 ( 0.00%) 19610.92 (178.63%) disk-601 7072.04 ( 0.00%) 19464.53 (175.23%) disk-661 7010.97 ( 0.00%) 19348.23 (175.97%) disk-721 7069.44 ( 0.00%) 19255.33 (172.37%) disk-781 7007.58 ( 0.00%) 19103.14 (172.61%) disk-841 6981.18 ( 0.00%) 18964.22 (171.65%) disk-901 6968.47 ( 0.00%) 18826.72 (170.17%) disk-961 6964.61 ( 0.00%) 18708.02 (168.62%) Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-7-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/osq_lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index ec83d4db8ec6..c112d00341b0 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!smp_load_acquire(&node->locked)) { + while (!ACCESS_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. */ -- cgit v1.2.3 From 0f1ba9a2cea52896448ef4e14a4cb1880b8e5bee Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 7 Jan 2015 10:04:41 +0100 Subject: softirq/preempt: Add missing current->preempt_disable_ip update While debugging some "sleeping function called from invalid context" bug I realized that the debugging message "Preemption disabled at:" pointed to an incorrect function. In particular if the last function/action that disabled preemption was spin_lock_bh() then current->preempt_disable_ip won't be updated. The reason for this is that __local_bh_disable_ip() will increase preempt_count manually instead of calling preempt_count_add(), which would handle the update correctly. It look like the manual handling was done to work around some lockdep issue. So add the missing update of current->preempt_disable_ip to __local_bh_disable_ip() as well. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra (Intel) Cc: Paul E. McKenney Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150107090441.GC4365@osiris Signed-off-by: Ingo Molnar --- kernel/softirq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9ac1be..b9ae81643f1d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == cnt) + if (preempt_count() == cnt) { +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); +#endif trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } } EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ -- cgit v1.2.3 From 28423ad283d5348793b0c45cc9b1af058e776fd6 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Tue, 13 Jan 2015 13:16:18 -0800 Subject: ksoftirqd: Enable IRQs and call cond_resched() before poking RCU While debugging an issue with excessive softirq usage, I encountered the following note in commit 3e339b5dae24a706 ("softirq: Use hotplug thread infrastructure"): [ paulmck: Call rcu_note_context_switch() with interrupts enabled. ] ...but despite this note, the patch still calls RCU with IRQs disabled. This seemingly innocuous change caused a significant regression in softirq CPU usage on the sending side of a large TCP transfer (~1 GB/s): when introducing 0.01% packet loss, the softirq usage would jump to around 25%, spiking as high as 50%. Before the change, the usage would never exceed 5%. Moving the call to rcu_note_context_switch() after the cond_sched() call, as it was originally before the hotplug patch, completely eliminated this problem. Signed-off-by: Calvin Owens Cc: stable@vger.kernel.org Signed-off-by: Paul E. McKenney --- kernel/softirq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9ac1be..c497fcdf0d1e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -656,9 +656,13 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(); local_irq_enable(); cond_resched(); + + preempt_disable(); + rcu_note_context_switch(); + preempt_enable(); + return; } local_irq_enable(); -- cgit v1.2.3 From 60479676eb6e9913176d93ebad194e3dc167bc63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Jan 2015 13:20:26 -0800 Subject: ksoftirqd: Use new cond_resched_rcu_qs() function Simplify run_ksoftirqd() by using the new cond_resched_rcu_qs() function that conditionally reschedules, but unconditionally supplies an RCU quiescent state. This commit is separate from the previous commit by Calvin Owens because Calvin's approach can be backported, while this commit cannot be. The reason that this commit cannot be backported is that cond_resched_rcu_qs() does not always provide the needed quiescent state in earlier kernels. Signed-off-by: Paul E. McKenney --- kernel/softirq.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index c497fcdf0d1e..8cdb98847c7b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -657,12 +657,7 @@ static void run_ksoftirqd(unsigned int cpu) */ __do_softirq(); local_irq_enable(); - cond_resched(); - - preempt_disable(); - rcu_note_context_switch(); - preempt_enable(); - + cond_resched_rcu_qs(); return; } local_irq_enable(); -- cgit v1.2.3 From a94844b22a2e2b9155bbc0878c507850477221c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Dec 2014 07:37:48 -0800 Subject: rcu: Optionally run grace-period kthreads at real-time priority Recent testing has shown that under heavy load, running RCU's grace-period kthreads at real-time priority can improve performance (according to 0day test robot) and reduce the incidence of RCU CPU stall warnings. However, most systems do just fine with the default non-realtime priorities for these kthreads, and it does not make sense to expose the entire user base to any risk stemming from this change, given that this change is of use only to a few users running extremely heavy workloads. Therefore, this commit allows users to specify realtime priorities for the grace-period kthreads, but leaves them running SCHED_OTHER by default. The realtime priority may be specified at build time via the RCU_KTHREAD_PRIO Kconfig parameter, or at boot time via the rcutree.kthread_prio parameter. Either way, 0 says to continue the default SCHED_OTHER behavior and values from 1-99 specify that priority of SCHED_FIFO behavior. Note that a value of 0 is not permitted when the RCU_BOOST Kconfig parameter is specified. Signed-off-by: Paul E. McKenney --- init/Kconfig | 7 ++++--- kernel/rcu/tree.c | 24 +++++++++++++++++++++++- kernel/rcu/tree_plugin.h | 4 ---- 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 9afb971497f4..d3ee66a6990f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -668,9 +668,10 @@ config RCU_BOOST config RCU_KTHREAD_PRIO int "Real-time priority to use for RCU worker threads" - range 1 99 - depends on RCU_BOOST - default 1 + range 1 99 if RCU_BOOST + range 0 99 if !RCU_BOOST + default 1 if RCU_BOOST + default 0 if !RCU_BOOST help This option specifies the SCHED_FIFO priority value that will be assigned to the rcuc/n and rcub/n threads and is also the value diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5987fdc85fc4..75ce12316b4c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -3597,17 +3601,35 @@ static int rcu_pm_notify(struct notifier_block *self, static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; + int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct rcu_state *rsp; + struct sched_param sp; struct task_struct *t; + /* Force priority into range. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + if (kthread_prio != kthread_prio_in) + pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", + kthread_prio, kthread_prio_in); + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); + t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); } rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81ff8b9a5a39..dcb32638f88b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -34,10 +34,6 @@ #include "../locking/rtmutex_common.h" -/* rcuc/rcub kthread realtime priority */ -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -module_param(kthread_prio, int, 0644); - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. -- cgit v1.2.3 From 5cd37193ce8539be1e6ef76be226f4bcc984e0f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 13 Dec 2014 20:32:04 -0800 Subject: rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used in places where it would be useful for it to apply to the normal RCU flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the case for workloads that aggressively overload the system, particularly those that generate large numbers of RCU updates on systems running NO_HZ_FULL CPUs. This commit therefore communicates quiescent states from cond_resched_rcu_qs() to the normal RCU flavors. Note that it is unfortunately necessary to leave the old ->passed_quiesce mechanism in place to allow quiescent states that apply to only one flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in that case, but that is not so good for debugging of RCU internals.) In addition, if one of the RCU flavor's grace period has stalled, this will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight quiescent state visible from other CPUs. Reported-by: Sasha Levin Reported-by: Dave Jones Signed-off-by: Paul E. McKenney [ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu() was used in preemptible code. ] --- Documentation/RCU/trace.txt | 32 ++++++++++++++++---------------- include/linux/rcupdate.h | 3 ++- include/linux/rcutiny.h | 5 ++++- include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 38 +++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_trace.c | 8 ++++++-- 7 files changed, 65 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index b63b9bb3bc0c..08651da15448 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -56,14 +56,14 @@ rcuboost: The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 This file has one line per CPU, or eight for this 8-CPU system. The fields are as follows: @@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this Kernels compiled with CONFIG_RCU_BOOST=y display the following from /debug/rcu/rcu_preempt/rcudata: - 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 This is similar to the output discussed above, but contains the following additional fields: diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f5939a452..468228750299 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -331,12 +331,13 @@ static inline void rcu_init_nohz(void) extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ + rcu_all_qs(); \ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..fabd3fad8516 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -154,7 +154,10 @@ static inline bool rcu_is_watching(void) return true; } - #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_all_qs(void) +{ +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..ddba927f7316 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_all_qs(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75ce12316b4c..cb00e038c2f2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -288,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiesecent state for all RCU flavors. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desparate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -1609,6 +1628,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); ACCESS_ONCE(rdp->gpwrap) = false; @@ -2075,8 +2095,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2085,6 +2107,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2129,7 +2152,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -3174,9 +3198,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3510,6 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7472ff388d55..1e7f8b05714e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,8 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, -- cgit v1.2.3 From fb81a44b88e6173ed0f6e9d6a1afa5305fb63f6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 08:35:02 -0800 Subject: rcu: Add GP-kthread-starvation checks to CPU stall warnings This commit adds a message that is printed if the relevant grace-period kthread has not been able to run for the two seconds preceding the stall warning. (The two seconds is double the maximum interval between successive bouts of quiescent-state forcing.) Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 9 +++++++++ kernel/rcu/tree.c | 21 ++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 55f9707fe60a..53e7d2856db6 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -152,6 +152,15 @@ no non-lazy callbacks ("." is printed otherwise, as shown above) and "D" indicates that dyntick-idle processing is enabled ("." is printed otherwise, for example, if disabled via the "nohz=" kernel boot parameter). +If the relevant grace-period kthread has been unable to run prior to +the stall warning, the following additional line is printed: + + rcu_preempt kthread starved for 2023 jiffies! + +Starving the grace-period kthreads of CPU time can of course result in +RCU CPU stall warnings even when all CPUs and tasks have passed through +the required quiescent states. + Multiple Warnings From One Stall diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb00e038c2f2..e335f33d0b9f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1072,6 +1072,21 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); } +/* + * Complain about starvation of grace-period kthread. + */ +static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +{ + unsigned long gpa; + unsigned long j; + + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + if (j - gpa > 2 * HZ) + pr_err("%s kthread starved for %ld jiffies!\n", + rsp->name, j - gpa); +} + /* * Dump stacks of all tasks running on stalled CPUs. */ @@ -1169,9 +1184,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); + rcu_check_gp_kthread_starvation(rsp); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1196,6 +1212,9 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); + + rcu_check_gp_kthread_starvation(rsp); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v1.2.3 From ec1fe396ff42e240c9b32111ee53665c5916fe5e Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Mon, 22 Dec 2014 11:10:12 -0800 Subject: rcu: Fix RCU CPU stall detection in tiny implementation The tiny RCU CPU stall detection depends on *rcp->curtail not being NULL. It is however a tail pointer and thus NULL by definition. Instead we should check rcp->rcucblist for the presence of pending callbacks which need to be processed. With this fix INFO about the stall is printed and jiffies_stall (jiffies at next stall) correctly updated. Note that the check for pending callback is necessary to avoid spurious warnings if there are no pendings callbacks. Signed-off-by: Miroslav Benes [ paulmck: Fused identical "if" statements, ported to -rcu. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny_plugin.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c56569127..80f908afe348 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->ticks_this_gp++; j = jiffies; js = ACCESS_ONCE(rcp->jiffies_stall); - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) + } else if (ULONG_CMP_GE(j, js)) { ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + } } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -- cgit v1.2.3 From 630181c4a915edb0761bb282c567d3abc8ca2f35 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Dec 2014 21:33:14 -0800 Subject: rcu: Initialize tiny RCU stall-warning timeouts at boot The current tiny RCU stall-warning code assumes that the jiffies counter starts at zero, however, it is sometimes initialized to other values, for example, -30,000. This commit therefore changes rcu_init() to invoke reset_cpu_stall_ticks() for both flavors of RCU to initialize the stall-warning times properly at boot. Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649f8817..fc0f2c07af3a 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -383,6 +383,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); rcu_early_boot_tests(); } -- cgit v1.2.3 From 29187a9eeaf362d8422e62e17a22a6e115277a49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Jan 2015 14:21:16 -0500 Subject: workqueue: fix subtle pool management issue which can stall whole worker_pool A worker_pool's forward progress is guaranteed by the fact that the last idle worker assumes the manager role to create more workers and summon the rescuers if creating workers doesn't succeed in timely manner before proceeding to execute work items. This manager role is implemented in manage_workers(), which indicates whether the worker may proceed to work item execution with its return value. This is necessary because multiple workers may contend for the manager role, and, if there already is a manager, others should proceed to work item execution. Unfortunately, the function also indicates that the worker may proceed to work item execution if need_to_create_worker() is false at the head of the function. need_to_create_worker() tests the following conditions. pending work items && !nr_running && !nr_idle The first and third conditions are protected by pool->lock and thus won't change while holding pool->lock; however, nr_running can change asynchronously as other workers block and resume and while it's likely to be zero, as someone woke this worker up in the first place, some other workers could have become runnable inbetween making it non-zero. If this happens, manage_worker() could return false even with zero nr_idle making the worker, the last idle one, proceed to execute work items. If then all workers of the pool end up blocking on a resource which can only be released by a work item which is pending on that pool, the whole pool can deadlock as there's no one to create more workers or summon the rescuers. This patch fixes the problem by removing the early exit condition from maybe_create_worker() and making manage_workers() return false iff there's already another manager, which ensures that the last worker doesn't start executing work items. We can leave the early exit condition alone and just ignore the return value but the only reason it was put there is because the manage_workers() used to perform both creations and destructions of workers and thus the function may be invoked while the pool is trying to reduce the number of workers. Now that manage_workers() is called only when more workers are needed, the only case this early exit condition is triggered is rare race conditions rendering it pointless. Tested with simulated workload and modified workqueue code which trigger the pool deadlock reliably without this patch. Signed-off-by: Tejun Heo Reported-by: Eric Sandeen Link: http://lkml.kernel.org/g/54B019F4.8030009@sandeen.net Cc: Dave Chinner Cc: Lai Jiangshan Cc: stable@vger.kernel.org --- kernel/workqueue.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6202b08f1933..beeeac9e0e3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool) * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. */ -static bool maybe_create_worker(struct worker_pool *pool) +static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { - if (!need_to_create_worker(pool)) - return false; restart: spin_unlock_irq(&pool->lock); @@ -1877,7 +1871,6 @@ restart: */ if (need_to_create_worker(pool)) goto restart; - return true; } /** @@ -1897,16 +1890,14 @@ restart: * multiple times. Does GFP_KERNEL allocations. * * Return: - * %false if the pool don't need management and the caller can safely start - * processing works, %true indicates that the function released pool->lock - * and reacquired it to perform some management function and that the - * conditions that the caller verified while holding the lock before - * calling the function might no longer be true. + * %false if the pool doesn't need management and the caller can safely + * start processing works, %true if management function was performed and + * the conditions that the caller verified before calling the function may + * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - bool ret = false; /* * Anyone who successfully grabs manager_arb wins the arbitration @@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker) * actual management, the pool may stall indefinitely. */ if (!mutex_trylock(&pool->manager_arb)) - return ret; + return false; - ret |= maybe_create_worker(pool); + maybe_create_worker(pool); mutex_unlock(&pool->manager_arb); - return ret; + return true; } /** -- cgit v1.2.3 From 053c095a82cf773075e83d7233b5cc19a1f73ece Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 Jan 2015 22:09:00 +0100 Subject: netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- drivers/acpi/event.c | 7 +------ drivers/net/ethernet/rocker/rocker.c | 3 ++- drivers/net/vxlan.c | 3 ++- drivers/net/wireless/mac80211_hwsim.c | 3 ++- drivers/scsi/pmcraid.c | 8 +------- drivers/target/target_core_user.c | 4 +--- drivers/thermal/thermal_core.c | 6 +----- fs/dlm/netlink.c | 7 +------ include/net/genetlink.h | 4 ++-- include/net/netlink.h | 6 +----- kernel/taskstats.c | 13 ++----------- net/bridge/br_fdb.c | 3 ++- net/bridge/br_mdb.c | 3 ++- net/bridge/br_netlink.c | 3 ++- net/can/gw.c | 3 ++- net/core/fib_rules.c | 3 ++- net/core/neighbour.c | 12 ++++++++---- net/core/rtnetlink.c | 9 ++++++--- net/decnet/dn_dev.c | 3 ++- net/decnet/dn_route.c | 3 ++- net/decnet/dn_table.c | 3 ++- net/ieee802154/netlink.c | 12 ++---------- net/ieee802154/nl-mac.c | 3 ++- net/ieee802154/nl-phy.c | 3 ++- net/ieee802154/nl802154.c | 6 ++++-- net/ipv4/devinet.c | 8 +++++--- net/ipv4/fib_semantics.c | 3 ++- net/ipv4/inet_diag.c | 9 ++++++--- net/ipv4/ipmr.c | 3 ++- net/ipv4/route.c | 3 ++- net/ipv4/tcp_metrics.c | 3 ++- net/ipv6/addrconf.c | 32 +++++++++++++++++++------------- net/ipv6/addrlabel.c | 5 +++-- net/ipv6/ip6_fib.c | 1 - net/ipv6/ip6mr.c | 3 ++- net/ipv6/route.c | 3 ++- net/l2tp/l2tp_netlink.c | 10 ++++++---- net/netfilter/ipvs/ip_vs_ctl.c | 9 ++++++--- net/netfilter/nf_tables_api.c | 18 ++++++++++++------ net/netlabel/netlabel_cipso_v4.c | 3 ++- net/netlabel/netlabel_mgmt.c | 6 ++++-- net/netlabel/netlabel_unlabeled.c | 3 ++- net/netlink/diag.c | 3 ++- net/netlink/genetlink.c | 6 ++++-- net/nfc/netlink.c | 12 +++++++----- net/openvswitch/datapath.c | 9 ++++++--- net/packet/diag.c | 3 ++- net/phonet/pn_netlink.c | 16 +++++++++++----- net/unix/diag.c | 3 ++- net/wireless/nl80211.c | 27 ++++++++++++++++++--------- net/xfrm/xfrm_user.c | 27 ++++++++++++++++++--------- 51 files changed, 203 insertions(+), 158 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index ef2d730734dc..e24ea4e796e4 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -100,7 +100,6 @@ int acpi_bus_generate_netlink_event(const char *device_class, struct acpi_genl_event *event; void *msg_header; int size; - int result; /* allocate memory */ size = nla_total_size(sizeof(struct acpi_genl_event)) + @@ -137,11 +136,7 @@ int acpi_bus_generate_netlink_event(const char *device_class, event->data = data; /* send multicast genetlink message */ - result = genlmsg_end(skb, msg_header); - if (result < 0) { - nlmsg_free(skb); - return result; - } + genlmsg_end(skb, msg_header); genlmsg_multicast(&acpi_event_genl_family, skb, 0, 0, GFP_ATOMIC); return 0; diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 964d719b150f..d54781e71cd4 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3674,7 +3674,8 @@ static int rocker_fdb_fill_info(struct sk_buff *skb, if (vid && nla_put_u16(skb, NDA_VLAN, vid)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 6b6b45622a0a..c5f79e7513a6 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -363,7 +363,8 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 494e7335aa64..4a4c6586a8d2 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2557,7 +2557,8 @@ static int mac80211_hwsim_get_radio(struct sk_buff *skb, if (res < 0) goto out_err; - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; out_err: genlmsg_cancel(skb, hdr); diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 8c27b6a77ec4..cf222f46eac5 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1473,13 +1473,7 @@ static int pmcraid_notify_aen( } /* send genetlink multicast message to notify appplications */ - result = genlmsg_end(skb, msg_header); - - if (result < 0) { - pmcraid_err("genlmsg_end failed\n"); - nlmsg_free(skb); - return result; - } + genlmsg_end(skb, msg_header); result = genlmsg_multicast(&pmcraid_event_family, skb, 0, 0, GFP_ATOMIC); diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 1157b559683b..1a1bcf71ec9d 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -784,9 +784,7 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int mino if (ret < 0) goto free_skb; - ret = genlmsg_end(skb, msg_header); - if (ret < 0) - goto free_skb; + genlmsg_end(skb, msg_header); ret = genlmsg_multicast(&tcmu_genl_family, skb, 0, TCMU_MCGRP_CONFIG, GFP_KERNEL); diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 87e0b0782023..48491d1a81d6 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1759,11 +1759,7 @@ int thermal_generate_netlink_event(struct thermal_zone_device *tz, thermal_event->event = event; /* send multicast genetlink message */ - result = genlmsg_end(skb, msg_header); - if (result < 0) { - nlmsg_free(skb); - return result; - } + genlmsg_end(skb, msg_header); result = genlmsg_multicast(&thermal_event_genl_family, skb, 0, 0, GFP_ATOMIC); diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index e7cfbaf8d0e2..1e6e227134d7 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); void *data = genlmsg_data(genlhdr); - int rv; - rv = genlmsg_end(skb, data); - if (rv < 0) { - nlmsg_free(skb); - return rv; - } + genlmsg_end(skb, data); return genlmsg_unicast(&init_net, skb, listener_nlportid); } diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 84125088c309..f24aa83b80b6 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -245,9 +245,9 @@ static inline void *genlmsg_put_reply(struct sk_buff *skb, * @skb: socket buffer the message is stored in * @hdr: user specific header */ -static inline int genlmsg_end(struct sk_buff *skb, void *hdr) +static inline void genlmsg_end(struct sk_buff *skb, void *hdr) { - return nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); + nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** diff --git a/include/net/netlink.h b/include/net/netlink.h index d5869b90bfbb..e010ee8da41d 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -490,14 +490,10 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) * Corrects the netlink message header to include the appeneded * attributes. Only necessary if attributes have been added to * the message. - * - * Returns the total data length of the skb. */ -static inline int nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) +static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) { nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; - - return skb->len; } /** diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 670fff88a961..21f82c29c914 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); - int rc; - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return rc; - } + genlmsg_end(skb, reply); return genlmsg_reply(skb, info); } @@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb, void *reply = genlmsg_data(genlhdr); int rc, delcount = 0; - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return; - } + genlmsg_end(skb, reply); rc = 0; down_read(&listeners->sem); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 03667e65cc29..08bf04bdac58 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -633,7 +633,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index fed61c971b17..409608960899 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -190,7 +190,8 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb, nla_nest_end(skb, nest2); nla_nest_end(skb, nest); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; end: nla_nest_end(skb, nest); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 163950b10d8c..528cf2790a5f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -263,7 +263,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, } done: - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/can/gw.c b/net/can/gw.c index 295f62e62eb3..a6f448e18ea8 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -575,7 +575,8 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; cancel: nlmsg_cancel(skb, nlh); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 185c341fafbd..44706e81b2e0 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -609,7 +609,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8d614c93f86a..d36d564f149f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1884,7 +1884,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, goto nla_put_failure; read_unlock_bh(&tbl->lock); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: read_unlock_bh(&tbl->lock); @@ -1917,7 +1918,8 @@ static int neightbl_fill_param_info(struct sk_buff *skb, goto errout; read_unlock_bh(&tbl->lock); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; errout: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); @@ -2202,7 +2204,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -2232,7 +2235,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eadc5c0e2dfa..e13b9dbdf154 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1199,7 +1199,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_nest_end(skb, af_spec); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -2326,7 +2327,8 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -2809,7 +2811,8 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, nla_nest_end(skb, protinfo); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 4400da7739da..b2c26b081134 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -702,7 +702,8 @@ static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || nla_put_u32(skb, IFA_FLAGS, ifa_flags)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index daccc4a36d80..812e5e6e88fb 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1616,7 +1616,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_u32(skb, RTA_IIF, rt->fld.flowidn_iif) < 0) goto errout; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; errout: nlmsg_cancel(skb, nlh); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 3f19fcbf126d..1540b506e3e0 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -367,7 +367,8 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_nest_end(skb, mp_head); } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; errout: nlmsg_cancel(skb, nlh); diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index fa1464762d0d..c8133c07ceee 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -63,13 +63,9 @@ int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group) struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); - if (genlmsg_end(msg, hdr) < 0) - goto out; + genlmsg_end(msg, hdr); return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC); -out: - nlmsg_free(msg); - return -ENOBUFS; } struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, @@ -96,13 +92,9 @@ int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info) struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); - if (genlmsg_end(msg, hdr) < 0) - goto out; + genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); -out: - nlmsg_free(msg); - return -ENOBUFS; } static const struct genl_ops ieee8021154_ops[] = { diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 3c902e9516fb..9105265920fe 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -136,7 +136,8 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, } wpan_phy_put(phy); - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: wpan_phy_put(phy); diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 7baf98b14611..1b9d25f6e898 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -65,7 +65,8 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, goto nla_put_failure; mutex_unlock(&phy->pib_lock); kfree(buf); - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: mutex_unlock(&phy->pib_lock); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index a25b9bbd077b..a4daf91b8d0a 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -306,7 +306,8 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, goto nla_put_failure; finish: - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -489,7 +490,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt)) goto nla_put_failure; - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 214882e7d6de..5f344eb3fc25 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1522,7 +1522,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, preferred, valid)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -1566,7 +1567,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) <= 0) { + RTM_NEWADDR, NLM_F_MULTI) < 0) { rcu_read_unlock(); goto done; } @@ -1749,7 +1750,8 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d2b7b5521b1b..265cb72b7c1b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1091,7 +1091,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_nest_end(skb, mp); } #endif - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e34dccbc4d70..81751f12645f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -203,7 +203,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, icsk->icsk_ca_ops->get_info(sk, ext, skb); out: - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; errout: nlmsg_cancel(skb, nlh); @@ -271,7 +272,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, } #endif - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, @@ -758,7 +760,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, } #endif - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c8034587859d..9d78427652d2 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2290,7 +2290,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (err < 0 && err != -ENOENT) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce112d0f2698..f6e43ca5e641 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2390,7 +2390,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index ed9c9a91851c..e5f41bd5ec1b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -886,7 +886,8 @@ static int tcp_metrics_dump_info(struct sk_buff *skb, if (tcp_metrics_fill_info(skb, tm) < 0) goto nla_put_failure; - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f7c8bbeb27b7..8975d9501d50 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -489,7 +489,8 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -619,7 +620,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) <= 0) { + -1) < 0) { rcu_read_unlock(); goto done; } @@ -635,7 +636,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) <= 0) + -1) < 0) goto done; else h++; @@ -646,7 +647,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) <= 0) + -1) < 0) goto done; else h++; @@ -4047,7 +4048,8 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) goto error; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; error: nlmsg_cancel(skb, nlh); @@ -4076,7 +4078,8 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, return -EMSGSIZE; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, @@ -4101,7 +4104,8 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, return -EMSGSIZE; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } enum addr_type_t { @@ -4134,7 +4138,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWADDR, NLM_F_MULTI); - if (err <= 0) + if (err < 0) break; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } @@ -4151,7 +4155,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_GETMULTICAST, NLM_F_MULTI); - if (err <= 0) + if (err < 0) break; } break; @@ -4166,7 +4170,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_GETANYCAST, NLM_F_MULTI); - if (err <= 0) + if (err < 0) break; } break; @@ -4638,7 +4642,8 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, goto nla_put_failure; nla_nest_end(skb, protoinfo); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -4670,7 +4675,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWLINK, NLM_F_MULTI) <= 0) + RTM_NEWLINK, NLM_F_MULTI) < 0) goto out; cont: idx++; @@ -4747,7 +4752,8 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, ci.valid_time = ntohl(pinfo->valid); if (nla_put(skb, PREFIX_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index fd0dc47f471d..e43e79d0a612 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -490,7 +490,8 @@ static int ip6addrlbl_fill(struct sk_buff *skb, return -EMSGSIZE; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) @@ -510,7 +511,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, RTM_NEWADDRLABEL, NLM_F_MULTI); - if (err <= 0) + if (err < 0) break; } idx++; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 03c520a4ebeb..53775ee7d376 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -277,7 +277,6 @@ static int fib6_dump_node(struct fib6_walker *w) w->leaf = rt; return 1; } - WARN_ON(res == 0); } w->leaf = NULL; return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 722669754bbf..34b682617f50 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2388,7 +2388,8 @@ static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, if (err < 0 && err != -ENOENT) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 34dcbb59df75..c60f15775c53 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2725,7 +2725,8 @@ static int rt6_fill_node(struct net *net, if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 6b16598f31d5..b4e923f77954 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -390,7 +390,8 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla } out: - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); @@ -451,7 +452,7 @@ static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - tunnel, L2TP_CMD_TUNNEL_GET) <= 0) + tunnel, L2TP_CMD_TUNNEL_GET) < 0) goto out; ti++; @@ -752,7 +753,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl goto nla_put_failure; nla_nest_end(skb, nest); - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); @@ -816,7 +818,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - session, L2TP_CMD_SESSION_GET) <= 0) + session, L2TP_CMD_SESSION_GET) < 0) break; si++; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b8295a430a56..e55759056361 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2887,7 +2887,8 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb, if (ip_vs_genl_fill_service(skb, svc) < 0) goto nla_put_failure; - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); @@ -3079,7 +3080,8 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, if (ip_vs_genl_fill_dest(skb, dest) < 0) goto nla_put_failure; - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); @@ -3215,7 +3217,8 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) goto nla_put_failure; - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3b3ddb4fb9ee..70f697827b9b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -427,7 +427,8 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_trim(skb, nlh); @@ -971,7 +972,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_trim(skb, nlh); @@ -1707,7 +1709,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_trim(skb, nlh); @@ -2361,7 +2364,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; nla_nest_end(skb, desc); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_trim(skb, nlh); @@ -3035,7 +3039,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, nla_nest_end(skb, nest); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_trim(skb, nlh); @@ -3324,7 +3329,8 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_trim(skb, nlh); diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index c2f2a53a4879..179625353cac 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -641,7 +641,8 @@ static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg) if (ret_val != 0) goto listall_cb_failure; - return genlmsg_end(cb_arg->skb, data); + genlmsg_end(cb_arg->skb, data); + return 0; listall_cb_failure: genlmsg_cancel(cb_arg->skb, data); diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index e66e977ef2fa..8b3b789c43c2 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -456,7 +456,8 @@ static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg) goto listall_cb_failure; cb_arg->seq++; - return genlmsg_end(cb_arg->skb, data); + genlmsg_end(cb_arg->skb, data); + return 0; listall_cb_failure: genlmsg_cancel(cb_arg->skb, data); @@ -620,7 +621,8 @@ static int netlbl_mgmt_protocols_cb(struct sk_buff *skb, if (ret_val != 0) goto protocols_cb_failure; - return genlmsg_end(skb, data); + genlmsg_end(skb, data); + return 0; protocols_cb_failure: genlmsg_cancel(skb, data); diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 78a63c18779e..aec7994f78cf 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1163,7 +1163,8 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, goto list_cb_failure; cb_arg->seq++; - return genlmsg_end(cb_arg->skb, data); + genlmsg_end(cb_arg->skb, data); + return 0; list_cb_failure: genlmsg_cancel(cb_arg->skb, data); diff --git a/net/netlink/diag.c b/net/netlink/diag.c index bb59a7ed0859..3ee63a3cff30 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -91,7 +91,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sk_diag_put_rings_cfg(sk, skb)) goto out_nlmsg_trim; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; out_nlmsg_trim: nlmsg_cancel(skb, nlh); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2e11061ef885..f52a7d5734cd 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -756,7 +756,8 @@ static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq, nla_nest_end(skb, nla_grps); } - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); @@ -796,7 +797,8 @@ static int ctrl_fill_mcgrp_info(struct genl_family *family, nla_nest_end(skb, nest); nla_nest_end(skb, nla_grps); - return genlmsg_end(skb, hdr); + genlmsg_end(skb, hdr); + return 0; nla_put_failure: genlmsg_cancel(skb, hdr); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 44989fc8cddf..be387e6219a0 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -102,7 +102,8 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, goto nla_put_failure; } - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -518,7 +519,8 @@ static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) goto nla_put_failure; - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -908,7 +910,8 @@ static int nfc_genl_send_params(struct sk_buff *msg, nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux))) goto nla_put_failure; - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: @@ -1247,8 +1250,7 @@ static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; - if (genlmsg_end(msg, hdr) < 0) - goto nla_put_failure; + genlmsg_end(msg, hdr); } return 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 8bda3cc12344..f45f1bf4422c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -799,7 +799,8 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, if (err) goto error; - return genlmsg_end(skb, ovs_header); + genlmsg_end(skb, ovs_header); + return 0; error: genlmsg_cancel(skb, ovs_header); @@ -1349,7 +1350,8 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; - return genlmsg_end(skb, ovs_header); + genlmsg_end(skb, ovs_header); + return 0; nla_put_failure: genlmsg_cancel(skb, ovs_header); @@ -1723,7 +1725,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (err == -EMSGSIZE) goto error; - return genlmsg_end(skb, ovs_header); + genlmsg_end(skb, ovs_header); + return 0; nla_put_failure: err = -EMSGSIZE; diff --git a/net/packet/diag.c b/net/packet/diag.c index 92f2c7107eec..0ed68f0238bf 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -177,7 +177,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, PACKET_DIAG_FILTER)) goto out_nlmsg_trim; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; out_nlmsg_trim: nlmsg_cancel(skb, nlh); diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index b64151ade6b3..54d766842c2b 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -121,7 +121,8 @@ static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, ifm->ifa_index = dev->ifindex; if (nla_put_u8(skb, IFA_LOCAL, addr)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -190,7 +191,8 @@ static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, if (nla_put_u8(skb, RTA_DST, dst) || nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -282,9 +284,13 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (addr_idx++ < addr_start_idx) continue; - if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE)) - goto out; + fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE); + /* fill_route() used to return > 0 (or negative errors) but + * never 0 - ignore the return value and just go out to + * call dumpit again from outside to preserve the behavior + */ + goto out; } out: diff --git a/net/unix/diag.c b/net/unix/diag.c index 86fa0f3b2caf..ef542fbca9fe 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -155,7 +155,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown)) goto out_nlmsg_trim; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; out_nlmsg_trim: nlmsg_cancel(skb, nlh); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 380784378df8..4ed9039bd5f9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1721,7 +1721,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, break; } finish: - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -2404,7 +2405,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag goto nla_put_failure; } - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -3825,7 +3827,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, sinfo->assoc_req_ies)) goto nla_put_failure; - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -4555,7 +4558,8 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq, nla_nest_end(msg, pinfoattr); - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -5507,7 +5511,8 @@ static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb, nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG)) goto nla_put_failure; - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -6577,7 +6582,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, nla_nest_end(msg, bss); - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; fail_unlock_rcu: rcu_read_unlock(); @@ -6686,7 +6692,8 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, nla_nest_end(msg, infoattr); - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -11025,7 +11032,8 @@ static int nl80211_send_scan_msg(struct sk_buff *msg, /* ignore errors and send incomplete event anyway */ nl80211_add_scan_req(msg, rdev); - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); @@ -11048,7 +11056,8 @@ nl80211_send_sched_scan_msg(struct sk_buff *msg, nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) goto nla_put_failure; - return genlmsg_end(msg, hdr); + genlmsg_end(msg, hdr); + return 0; nla_put_failure: genlmsg_cancel(msg, hdr); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8128594ab379..7de2ed9ec46d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1019,7 +1019,8 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, return err; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1121,7 +1122,8 @@ static int build_sadinfo(struct sk_buff *skb, struct net *net, return err; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1842,7 +1844,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) goto out_cancel; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; out_cancel: nlmsg_cancel(skb, nlh); @@ -2282,7 +2285,8 @@ static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m, goto out_cancel; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; out_cancel: nlmsg_cancel(skb, nlh); @@ -2490,7 +2494,8 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) return err; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c) @@ -2712,7 +2717,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, return err; } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, @@ -2827,7 +2833,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, } upe->hard = !!hard; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) @@ -2986,7 +2993,8 @@ static int build_report(struct sk_buff *skb, u8 proto, return err; } } - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int xfrm_send_report(struct net *net, u8 proto, @@ -3031,7 +3039,8 @@ static int build_mapping(struct sk_buff *skb, struct xfrm_state *x, um->old_sport = x->encap->encap_sport; um->reqid = x->props.reqid; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; } static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, -- cgit v1.2.3 From 996636ddae5cab8883bd76b996cd4f2ea9a152be Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 16 Jan 2015 20:28:06 +0100 Subject: futex: Fix argument handling in futex_lock_pi() calls This patch fixes two separate buglets in calls to futex_lock_pi(): * Eliminate unused 'detect' argument * Change unused 'timeout' argument of FUTEX_TRYLOCK_PI to NULL The 'detect' argument of futex_lock_pi() seems never to have been used (when it was included with the initial PI mutex implementation in Linux 2.6.18, all checks against its value were disabled by ANDing against 0 (i.e., if (detect... && 0)), and with commit 778e9a9c3e7193ea9f434f382947155ffb59c755, any mention of this argument in futex_lock_pi() went way altogether. Its presence now serves only to confuse readers of the code, by giving the impression that the futex() FUTEX_LOCK_PI operation actually does use the 'val' argument. This patch removes the argument. The futex_lock_pi() call that corresponds to FUTEX_TRYLOCK_PI includes 'timeout' as one of its arguments. This misleads the reader into thinking that the FUTEX_TRYLOCK_PI operation does employ timeouts for some sensible purpose; but it does not. Indeed, it cannot, because the checks at the start of sys_futex() exclude FUTEX_TRYLOCK_PI from the set of operations that do copy_from_user() on the timeout argument. So, in the FUTEX_TRYLOCK_PI futex_lock_pi() call it would be simplest to change 'timeout' to 'NULL'. This patch does that. Signed-off-by: Michael Kerrisk Reviewed-by: Darren Hart Link: http://lkml.kernel.org/r/54B96646.8010200@gmail.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..4eeb63de7e54 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: - return futex_lock_pi(uaddr, flags, val, timeout, 0); + return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: - return futex_lock_pi(uaddr, flags, 0, timeout, 1); + return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, -- cgit v1.2.3 From c772be52319de9756fd82f36d37a6d3e003441e3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:04 +1030 Subject: param: fix uninitialized read with CONFIG_DEBUG_LOCK_ALLOC ignore_lockdep is uninitialized, and sysfs_attr_init() doesn't initialize it, so memset to 0. Reported-by: Huang Ying Cc: Eric W. Biederman Signed-off-by: Rusty Russell --- kernel/params.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index bd65d136a470..728e05b167de 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -642,6 +642,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ + memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); mk->mp->attrs[mk->mp->num].param = kp; mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; -- cgit v1.2.3 From d453cded05ee219b77815ea194dc36efa5398bca Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:04 +1030 Subject: module_arch_freeing_init(): new hook for archs before module->module_init freed. Archs have been abusing module_free() to clean up their arch-specific allocations. Since module_free() is also (ab)used by BPF and trace code, let's keep it to simple allocations, and provide a hook called before that. This means that avr32, ia64, parisc and s390 no longer need to implement their own module_free() at all. avr32 doesn't need module_finalize() either. Signed-off-by: Rusty Russell Cc: Chris Metcalf Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-kernel@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linux-s390@vger.kernel.org --- arch/avr32/kernel/module.c | 13 +------------ arch/ia64/kernel/module.c | 6 ++---- arch/parisc/kernel/module.c | 6 +----- arch/s390/kernel/module.c | 10 +++------- arch/tile/kernel/module.c | 2 +- include/linux/moduleloader.h | 2 ++ kernel/module.c | 7 +++++++ 7 files changed, 17 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c index 2c9412908024..164efa009e5b 100644 --- a/arch/avr32/kernel/module.c +++ b/arch/avr32/kernel/module.c @@ -19,12 +19,10 @@ #include #include -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { vfree(mod->arch.syminfo); mod->arch.syminfo = NULL; - - vfree(module_region); } static inline int check_rela(Elf32_Rela *rela, struct module *module, @@ -291,12 +289,3 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, return ret; } - -int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, - struct module *module) -{ - vfree(module->arch.syminfo); - module->arch.syminfo = NULL; - - return 0; -} diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 24603be24c14..29754aae5177 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -305,14 +305,12 @@ plt_target (struct plt_entry *plt) #endif /* !USE_BRL */ void -module_free (struct module *mod, void *module_region) +module_arch_freeing_init (struct module *mod) { - if (mod && mod->arch.init_unw_table && - module_region == mod->module_init) { + if (mod->arch.init_unw_table) { unw_remove_unwind_table(mod->arch.init_unw_table); mod->arch.init_unw_table = NULL; } - vfree(module_region); } /* Have we already seen one of these relocations? */ diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 50dfafc3f2c1..5822e8e200e6 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -298,14 +298,10 @@ static inline unsigned long count_stubs(const Elf_Rela *rela, unsigned long n) } #endif - -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { kfree(mod->arch.section); mod->arch.section = NULL; - - vfree(module_region); } /* Additional bytes needed in front of individual sections */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b89b59158b95..409d152585be 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -55,14 +55,10 @@ void *module_alloc(unsigned long size) } #endif -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { - if (mod) { - vfree(mod->arch.syminfo); - mod->arch.syminfo = NULL; - } - vfree(module_region); + vfree(mod->arch.syminfo); + mod->arch.syminfo = NULL; } static void check_rela(Elf_Rela *rela, struct module *me) diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 96447c9160a0..62a597e810d6 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -83,7 +83,7 @@ void module_free(struct module *mod, void *module_region) 0, 0, 0, NULL, NULL, 0); /* - * FIXME: If module_region == mod->module_init, trim exception + * FIXME: Add module_arch_freeing_init to trim exception * table entries. */ } diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 7eeb9bbfb816..054eac853090 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -82,4 +82,6 @@ int module_finalize(const Elf_Ehdr *hdr, /* Any cleanup needed when module leaves. */ void module_arch_cleanup(struct module *mod); +/* Any cleanup before freeing mod->module_init */ +void module_arch_freeing_init(struct module *mod); #endif diff --git a/kernel/module.c b/kernel/module.c index 3965511ae133..68be0b1f9e7f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1804,6 +1804,10 @@ void __weak module_arch_cleanup(struct module *mod) { } +void __weak module_arch_freeing_init(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1841,6 +1845,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -2930,6 +2935,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); } @@ -3055,6 +3061,7 @@ static int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif unset_module_init_ro_nx(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v1.2.3 From be1f221c0445a4157d177197c236f888d3581914 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:05 +1030 Subject: module: remove mod arg from module_free, rename module_memfree(). Nothing needs the module pointer any more, and the next patch will call it from RCU, where the module itself might no longer exist. Removing the arg is the safest approach. This just codifies the use of the module_alloc/module_free pattern which ftrace and bpf use. Signed-off-by: Rusty Russell Acked-by: Alexei Starovoitov Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Ralf Baechle Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Steven Rostedt Cc: x86@kernel.org Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Masami Hiramatsu Cc: linux-cris-kernel@axis.com Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: nios2-dev@lists.rocketboards.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Cc: netdev@vger.kernel.org --- arch/cris/kernel/module.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/nios2/kernel/module.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 4 ++-- arch/tile/kernel/module.c | 2 +- arch/x86/kernel/ftrace.c | 2 +- include/linux/moduleloader.h | 2 +- kernel/bpf/core.c | 2 +- kernel/kprobes.c | 2 +- kernel/module.c | 14 +++++++------- 11 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/cris/kernel/module.c b/arch/cris/kernel/module.c index 51123f985eb5..af04cb6b6dc9 100644 --- a/arch/cris/kernel/module.c +++ b/arch/cris/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) } /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { kfree(module_region); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 9fd6834a2172..5d6139390bf8 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1388,7 +1388,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index cc924a38f22a..e2e3f13f98d5 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) } /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { kfree(module_region); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 1ca125b9c226..d1916b577f2c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -699,7 +699,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f33e7c7a3bf7..7931eeeb649a 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -776,7 +776,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (unlikely(proglen + ilen > oldproglen)) { pr_err("bpb_jit_compile fatal error\n"); kfree(addrs); - module_free(NULL, image); + module_memfree(image); return; } memcpy(image + proglen, temp, ilen); @@ -822,7 +822,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 62a597e810d6..2305084c9b93 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -74,7 +74,7 @@ error: /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { vfree(module_region); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2142376dc8c6..8b7b0a51e742 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -674,7 +674,7 @@ static inline void *alloc_tramp(unsigned long size) } static inline void tramp_free(void *tramp) { - module_free(NULL, tramp); + module_memfree(tramp); } #else /* Trampolines can only be created if modules are supported */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 054eac853090..f7556261fe3c 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -26,7 +26,7 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); void *module_alloc(unsigned long size); /* Free memory returned from module_alloc. */ -void module_free(struct module *mod, void *module_region); +void module_memfree(void *module_region); /* * Apply the given relocation to the (simplified) ELF. Return -error diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6594e457a25..a64e7a207d2b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - module_free(NULL, hdr); + module_memfree(hdr); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..ee619929cf90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) static void free_insn_page(void *page) { - module_free(NULL, page); + module_memfree(page); } struct kprobe_insn_cache kprobe_insn_slots = { diff --git a/kernel/module.c b/kernel/module.c index 68be0b1f9e7f..1f85fd5c89d3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1795,7 +1795,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif -void __weak module_free(struct module *mod, void *module_region) +void __weak module_memfree(void *module_region) { vfree(module_region); } @@ -1846,7 +1846,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_free(mod, mod->module_init); + module_memfree(mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1855,7 +1855,7 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_free(mod, mod->module_core); + module_memfree(mod->module_core); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2790,7 +2790,7 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_free(mod, mod->module_core); + module_memfree(mod->module_core); return -ENOMEM; } memset(ptr, 0, mod->init_size); @@ -2936,8 +2936,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_free(mod, mod->module_init); - module_free(mod, mod->module_core); + module_memfree(mod->module_init); + module_memfree(mod->module_core); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3062,7 +3062,7 @@ static int do_init_module(struct module *mod) #endif unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_free(mod, mod->module_init); + module_memfree(mod->module_init); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; -- cgit v1.2.3 From c749637909eea5d4090c6f50b89c2c20b534a280 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:05 +1030 Subject: module: fix race in kallsyms resolution during module load success. The kallsyms routines (module_symbol_name, lookup_module_* etc) disable preemption to walk the modules rather than taking the module_mutex: this is because they are used for symbol resolution during oopses. This works because there are synchronize_sched() and synchronize_rcu() in the unload and failure paths. However, there's one case which doesn't have that: the normal case where module loading succeeds, and we free the init section. We don't want a synchronize_rcu() there, because it would slow down module loading: this bug was introduced in 2009 to speed module loading in the first place. Thus, we want to do the free in an RCU callback. We do this in the simplest possible way by allocating a new rcu_head: if we put it in the module structure we'd have to worry about that getting freed. Reported-by: Rui Xiang Signed-off-by: Rusty Russell --- kernel/module.c | 55 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1f85fd5c89d3..ed4ec9c30bd2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2989,10 +2989,31 @@ static void do_mod_ctors(struct module *mod) #endif } +/* For freeing module_init on success, in case kallsyms traversing */ +struct mod_initfree { + struct rcu_head rcu; + void *module_init; +}; + +static void do_free_init(struct rcu_head *head) +{ + struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); + module_memfree(m->module_init); + kfree(m); +} + /* This is where the real work happens */ static int do_init_module(struct module *mod) { int ret = 0; + struct mod_initfree *freeinit; + + freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); + if (!freeinit) { + ret = -ENOMEM; + goto fail; + } + freeinit->module_init = mod->module_init; /* * We want to find out whether @mod uses async during init. Clear @@ -3005,18 +3026,7 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* - * Init routine failed: abort. Try to protect us from - * buggy refcounters. - */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; + goto fail_free_freeinit; } if (ret > 0) { pr_warn("%s: '%s'->init suspiciously returned %d, it should " @@ -3062,15 +3072,34 @@ static int do_init_module(struct module *mod) #endif unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; mod->init_text_size = 0; + /* + * We want to free module_init, but be aware that kallsyms may be + * walking this with preempt disabled. In all the failure paths, + * we call synchronize_rcu/synchronize_sched, but we don't want + * to slow down the success path, so use actual RCU here. + */ + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); return 0; + +fail_free_freeinit: + kfree(freeinit); +fail: + /* Try to protect us from buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; } static int may_init_module(void) -- cgit v1.2.3 From 32b7eb877165fdb29f1722071c6c64ced1789014 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Tue, 20 Jan 2015 12:49:35 +0100 Subject: livepatch: change ARCH_HAVE_LIVE_PATCHING to HAVE_LIVE_PATCHING Change ARCH_HAVE_LIVE_PATCHING to HAVE_LIVE_PATCHING in Kconfigs. HAVE_ bools are prevalent there and we should go with the flow. Suggested-by: Andrew Morton Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- kernel/livepatch/Kconfig | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 460b31b79938..29b095231276 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,7 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF - select ARCH_HAVE_LIVE_PATCHING + select HAVE_LIVE_PATCHING ### Arch settings config X86 diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 66797aa597c0..347ee2221137 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,4 +1,4 @@ -config ARCH_HAVE_LIVE_PATCHING +config HAVE_LIVE_PATCHING bool help Arch supports kernel live patching @@ -9,7 +9,7 @@ config LIVE_PATCHING depends on MODULES depends on SYSFS depends on KALLSYMS_ALL - depends on ARCH_HAVE_LIVE_PATCHING + depends on HAVE_LIVE_PATCHING help Say Y here if you want to support kernel live patching. This option has no runtime impact until a kernel "patch" -- cgit v1.2.3 From 2fded7f44b8fcf79e274c3f0cfbd0298f95308f3 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 23 Dec 2014 16:39:54 -0500 Subject: audit: remove vestiges of vers_ops Should have been removed with commit 18900909 ("audit: remove the old depricated kernel interface"). Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 1 - kernel/auditfilter.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 93331929d643..b481779a8dee 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -46,7 +46,6 @@ struct audit_tree; struct sk_buff; struct audit_krule { - int vers_ops; u32 pflags; u32 flags; u32 listnr; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 103586e239a2..81c94d739e3f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_nofree; bufp = data->buf; - entry->rule.vers_ops = 2; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; @@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) return ERR_PTR(-ENOMEM); new = &entry->rule; - new->vers_ops = old->vers_ops; new->flags = old->flags; new->pflags = old->pflags; new->listnr = old->listnr; -- cgit v1.2.3 From 83a90bb1345767f0cb96d242fd8b9db44b2b0e17 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Jan 2015 09:26:18 -0600 Subject: livepatch: enforce patch stacking semantics Only allow the topmost patch on the stack to be enabled or disabled, so that patches can't be removed or added in an arbitrary order. Suggested-by: Jiri Kosina Signed-off-by: Josh Poimboeuf Reviewed-by: Jiri Slaby Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3d9c00b5223a..2401e7f955d3 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -379,6 +379,11 @@ static int __klp_disable_patch(struct klp_patch *patch) struct klp_object *obj; int ret; + /* enforce stacking: only the last enabled patch can be disabled */ + if (!list_is_last(&patch->list, &klp_patches) && + list_next_entry(patch, list)->state == KLP_ENABLED) + return -EBUSY; + pr_notice("disabling patch '%s'\n", patch->mod->name); for (obj = patch->objs; obj->funcs; obj++) { @@ -435,6 +440,11 @@ static int __klp_enable_patch(struct klp_patch *patch) if (WARN_ON(patch->state != KLP_DISABLED)) return -EINVAL; + /* enforce stacking: only the first disabled patch can be enabled */ + if (patch->list.prev != &klp_patches && + list_prev_entry(patch, list)->state == KLP_DISABLED) + return -EBUSY; + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); -- cgit v1.2.3 From 3c33f5b99d688deafd21d4a770303691c7c3a320 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Jan 2015 09:26:19 -0600 Subject: livepatch: support for repatching a function Add support for patching a function multiple times. If multiple patches affect a function, the function in the most recently enabled patch "wins". This enables a cumulative patch upgrade path, where each patch is a superset of previous patches. This requires restructuring the data a little bit. With the current design, where each klp_func struct has its own ftrace_ops, we'd have to unregister the old ops and then register the new ops, because FTRACE_OPS_FL_IPMODIFY prevents us from having two ops registered for the same function at the same time. That would leave a regression window where the function isn't patched at all (not good for a patch upgrade path). This patch replaces the per-klp_func ftrace_ops with a global klp_ops list, with one ftrace_ops per original function. A single ftrace_ops is shared between all klp_funcs which have the same old_addr. This allows the switch between function versions to happen instantaneously by updating the klp_ops struct's func_stack list. The winner is the klp_func at the top of the func_stack (front of the list). [ jkosina@suse.cz: turn WARN_ON() into WARN_ON_ONCE() in ftrace handler to avoid storm in pathological cases ] Signed-off-by: Josh Poimboeuf Reviewed-by: Jiri Slaby Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 4 +- kernel/livepatch/core.c | 170 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 121 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 950bc615842f..f14c6fb262b4 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -40,8 +40,8 @@ enum klp_state { * @old_addr: a hint conveying at what address the old function * can be found (optional, vmlinux patches only) * @kobj: kobject for sysfs resources - * @fops: ftrace operations structure * @state: tracks function-level patch application state + * @stack_node: list node for klp_ops func_stack list */ struct klp_func { /* external */ @@ -59,8 +59,8 @@ struct klp_func { /* internal */ struct kobject kobj; - struct ftrace_ops *fops; enum klp_state state; + struct list_head stack_node; }; /** diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 2401e7f955d3..bc05d390ce85 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -29,17 +29,53 @@ #include #include -/* - * The klp_mutex protects the klp_patches list and state transitions of any - * structure reachable from the patches list. References to any structure must - * be obtained under mutex protection. +/** + * struct klp_ops - structure for tracking registered ftrace ops structs + * + * A single ftrace_ops is shared between all enabled replacement functions + * (klp_func structs) which have the same old_addr. This allows the switch + * between function versions to happen instantaneously by updating the klp_ops + * struct's func_stack list. The winner is the klp_func at the top of the + * func_stack (front of the list). + * + * @node: node for the global klp_ops list + * @func_stack: list head for the stack of klp_func's (active func is on top) + * @fops: registered ftrace ops struct */ +struct klp_ops { + struct list_head node; + struct list_head func_stack; + struct ftrace_ops fops; +}; +/* + * The klp_mutex protects the global lists and state transitions of any + * structure reachable from them. References to any structure must be obtained + * under mutex protection (except in klp_ftrace_handler(), which uses RCU to + * ensure it gets consistent data). + */ static DEFINE_MUTEX(klp_mutex); + static LIST_HEAD(klp_patches); +static LIST_HEAD(klp_ops); static struct kobject *klp_root_kobj; +static struct klp_ops *klp_find_ops(unsigned long old_addr) +{ + struct klp_ops *ops; + struct klp_func *func; + + list_for_each_entry(ops, &klp_ops, node) { + func = list_first_entry(&ops->func_stack, struct klp_func, + stack_node); + if (func->old_addr == old_addr) + return ops; + } + + return NULL; +} + static bool klp_is_module(struct klp_object *obj) { return obj->name; @@ -267,16 +303,28 @@ static int klp_write_object_relocations(struct module *pmod, static void notrace klp_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, + struct ftrace_ops *fops, struct pt_regs *regs) { - struct klp_func *func = ops->private; + struct klp_ops *ops; + struct klp_func *func; + + ops = container_of(fops, struct klp_ops, fops); + + rcu_read_lock(); + func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, + stack_node); + rcu_read_unlock(); + + if (WARN_ON_ONCE(!func)) + return; klp_arch_set_pc(regs, (unsigned long)func->new_func); } static int klp_disable_func(struct klp_func *func) { + struct klp_ops *ops; int ret; if (WARN_ON(func->state != KLP_ENABLED)) @@ -285,16 +333,28 @@ static int klp_disable_func(struct klp_func *func) if (WARN_ON(!func->old_addr)) return -EINVAL; - ret = unregister_ftrace_function(func->fops); - if (ret) { - pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } + ops = klp_find_ops(func->old_addr); + if (WARN_ON(!ops)) + return -EINVAL; - ret = ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); - if (ret) - pr_warn("function unregister succeeded but failed to clear the filter\n"); + if (list_is_singular(&ops->func_stack)) { + ret = unregister_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + if (ret) + pr_warn("function unregister succeeded but failed to clear the filter\n"); + + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + } else { + list_del_rcu(&func->stack_node); + } func->state = KLP_DISABLED; @@ -303,6 +363,7 @@ static int klp_disable_func(struct klp_func *func) static int klp_enable_func(struct klp_func *func) { + struct klp_ops *ops; int ret; if (WARN_ON(!func->old_addr)) @@ -311,22 +372,50 @@ static int klp_enable_func(struct klp_func *func) if (WARN_ON(func->state != KLP_DISABLED)) return -EINVAL; - ret = ftrace_set_filter_ip(func->fops, func->old_addr, 0, 0); - if (ret) { - pr_err("failed to set ftrace filter for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } + ops = klp_find_ops(func->old_addr); + if (!ops) { + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->fops.func = klp_ftrace_handler; + ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; + + list_add(&ops->node, &klp_ops); + + INIT_LIST_HEAD(&ops->func_stack); + list_add_rcu(&func->stack_node, &ops->func_stack); + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + goto err; + } + + ret = register_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + goto err; + } + - ret = register_ftrace_function(func->fops); - if (ret) { - pr_err("failed to register ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); } else { - func->state = KLP_ENABLED; + list_add_rcu(&func->stack_node, &ops->func_stack); } + func->state = KLP_ENABLED; + + return ret; + +err: + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); return ret; } @@ -582,10 +671,6 @@ static struct kobj_type klp_ktype_patch = { static void klp_kobj_release_func(struct kobject *kobj) { - struct klp_func *func; - - func = container_of(kobj, struct klp_func, kobj); - kfree(func->fops); } static struct kobj_type klp_ktype_func = { @@ -642,28 +727,11 @@ static void klp_free_patch(struct klp_patch *patch) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - struct ftrace_ops *ops; - int ret; - - ops = kzalloc(sizeof(*ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - ops->private = func; - ops->func = klp_ftrace_handler; - ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; - func->fops = ops; + INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; - ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, - obj->kobj, func->old_name); - if (ret) { - kfree(func->fops); - return ret; - } - - return 0; + return kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, func->old_name); } /* parts of the initialization that is done only when the object is loaded */ -- cgit v1.2.3 From dbed7ddab967550d1181633e8ac7905808f29a94 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Jan 2015 16:07:55 -0600 Subject: livepatch: fix uninitialized return value Fix a potentially uninitialized return value in klp_enable_func(). Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc05d390ce85..9adf86bfbcd5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -410,7 +410,7 @@ static int klp_enable_func(struct klp_func *func) func->state = KLP_ENABLED; - return ret; + return 0; err: list_del_rcu(&func->stack_node); -- cgit v1.2.3 From d5db139ab3764640e0882a1746e7b9fdee33fd87 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 22 Jan 2015 11:13:14 +1030 Subject: module: make module_refcount() a signed integer. James Bottomley points out that it will be -1 during unload. It's only used for diagnostics, so let's not hide that as it could be a clue as to what's gone wrong. Cc: Jason Wessel Acked-and-documention-added-by: James Bottomley Reviewed-by: Masami Hiramatsu Signed-off-by: Rusty Russell --- include/linux/module.h | 2 +- kernel/debug/kdb/kdb_main.c | 2 +- kernel/module.c | 17 +++++++++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index ebfb0e153c6a..b653d7c0a05a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -444,7 +444,7 @@ extern void __module_put_and_exit(struct module *mod, long code) #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD -unsigned long module_refcount(struct module *mod); +int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(VMLINUX_SYMBOL_STR(x)) void symbol_put_addr(void *addr); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..2934889f2cce 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1979,7 +1979,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4ld ", module_refcount(mod)); + kdb_printf("%4d ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/module.c b/kernel/module.c index ed4ec9c30bd2..d856e96a3cce 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -772,9 +772,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced) return 0; } -unsigned long module_refcount(struct module *mod) +/** + * module_refcount - return the refcount or -1 if unloading + * + * @mod: the module we're checking + * + * Returns: + * -1 if the module is in the process of unloading + * otherwise the number of references in the kernel to the module + */ +int module_refcount(struct module *mod) { - return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; + return atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -856,7 +865,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %lu ", module_refcount(mod)); + seq_printf(m, " %i ", module_refcount(mod)); /* * Always include a trailing , so userspace can differentiate @@ -908,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); + return sprintf(buffer, "%i\n", module_refcount(mk->mod)); } static struct module_attribute modinfo_refcnt = -- cgit v1.2.3 From 3c606d35fe9766ede86a45273a628b320be13b45 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 22 Jan 2015 10:19:43 -0500 Subject: cgroup: prevent mount hang due to memory controller lifetime Since b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups"), re-mounting the memory controller after using it is very likely to hang. The cgroup core assumes that any remaining references after deleting a cgroup are temporary in nature, and synchroneously waits for them, but the above-mentioned commit has left-over page cache pin its css until it is reclaimed naturally. That being said, swap entries and charged kernel memory have been doing the same indefinite pinning forever, the bug is just more likely to trigger with left-over page cache. Reparenting kernel memory is highly impractical, which leaves changing the cgroup assumptions to reflect this: once a controller has been mounted and used, it has internal state that is independent from mount and cgroup lifetime. It can be unmounted and remounted, but it can't be reconfigured during subsequent mounts. Don't offline the controller root as long as there are any children, dead or alive. A remount will no longer wait for these old references to drain, it will simply mount the persistent controller state again. Reported-by: "Suzuki K. Poulose" Reported-by: Will Deacon Signed-off-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bb263d0caab3..04cfe8ace520 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb) * * And don't kill the default root. */ - if (css_has_online_children(&root->cgrp.self) || + if (!list_empty(&root->cgrp.self.children) || root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else -- cgit v1.2.3 From 3efb5f21a36fbddd524cffe36426a84622ce580e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 11:28:28 -0500 Subject: tracing: Remove unneeded includes of debugfs.h and fs.h The creation of tracing files and directories is for the most part encapsulated in helper functions in trace.c. Other files do not need to include debugfs.h or fs.h, as they may have needed to in the past. Remove them from the files that do not need them. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 -- kernel/trace/trace_branch.c | 1 - kernel/trace/trace_export.c | 2 -- kernel/trace/trace_irqsoff.c | 2 -- kernel/trace/trace_nop.c | 2 -- kernel/trace/trace_printk.c | 2 -- kernel/trace/trace_sched_switch.c | 2 -- kernel/trace/trace_sched_wakeup.c | 2 -- kernel/trace/trace_stack.c | 2 -- 9 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7a4104cb95cb..96079180de3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include /* for self test */ @@ -23,7 +22,6 @@ #include #include #include -#include #include diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7d6e2afde669..57cbf1efdd44 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4ddde28a81a..12e2b99be862 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -6,12 +6,10 @@ #include #include #include -#include #include #include #include #include -#include #include "trace_output.h" diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9bb104f748d0..8523ea345f2b 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -10,11 +10,9 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include -#include #include #include #include -#include #include "trace.h" diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index fcf0a9e48916..8bb2071474dd 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -6,8 +6,6 @@ */ #include -#include -#include #include #include "trace.h" diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index c4e70b6bd7fa..7ee4b5cc1ce5 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -5,7 +5,6 @@ * */ #include -#include #include #include #include @@ -15,7 +14,6 @@ #include #include #include -#include #include "trace.h" diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 2e293beb186e..419ca37e72c9 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -5,8 +5,6 @@ * */ #include -#include -#include #include #include #include diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 8fb84b362816..d6e1003724e9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -10,8 +10,6 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include -#include -#include #include #include #include diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 16eddb308c33..e80927b88eb0 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -7,12 +7,10 @@ #include #include #include -#include #include #include #include #include -#include #include -- cgit v1.2.3 From 14a5ae40f0def33a422a45b2ed09198adb7bf11c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 11:14:16 -0500 Subject: tracing: Use IS_ERR() check for return value of tracing_init_dentry() tracing_init_dentry() will soon return NULL as a valid pointer for the top level tracing directroy. NULL can not be used as an error value. Instead, switch to ERR_PTR() and check the return status with IS_ERR(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_printk.c | 2 +- kernel/trace/trace_stack.c | 2 +- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_uprobe.c | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 929a733d302e..80c9d34540dd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5419,7 +5419,7 @@ static __init int ftrace_init_debugfs(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; ftrace_init_dyn_debugfs(d_tracer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7669b1f3234e..acd27555dc5b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5820,7 +5820,7 @@ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) return tr->dir; if (!debugfs_initialized()) - return NULL; + return ERR_PTR(-ENODEV); if (tr->flags & TRACE_ARRAY_FL_GLOBAL) tr->dir = debugfs_create_dir("tracing", NULL); @@ -5844,7 +5844,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + if (IS_ERR(d_tracer)) return NULL; tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); @@ -6047,7 +6047,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) return tr->options; d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + if (IS_ERR(d_tracer)) return NULL; tr->options = debugfs_create_dir("options", d_tracer); @@ -6538,7 +6538,7 @@ static __init int tracer_init_debugfs(void) trace_access_lock_init(); d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; init_tracer_debugfs(&global_trace, d_tracer); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 366a78a3e61e..4ff8c1394017 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2490,7 +2490,7 @@ static __init int event_trace_init(void) return -ENODEV; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ba476009e5de..2d25ad1526bb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1437,7 +1437,7 @@ static __init int init_graph_debugfs(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("max_graph_depth", 0644, d_tracer, diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..b4a00def88f5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1320,7 +1320,7 @@ static __init int init_kprobe_trace(void) return -EINVAL; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; entry = debugfs_create_file("kprobe_events", 0644, d_tracer, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7ee4b5cc1ce5..36c1455b7567 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -347,7 +347,7 @@ static __init int init_trace_printk_function_export(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("printk_formats", 0444, d_tracer, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e80927b88eb0..c3e4fcfddd45 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -460,7 +460,7 @@ static __init int stack_trace_init(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("stack_max_size", 0644, d_tracer, diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 7af67360b330..75e19e86c954 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -276,7 +276,7 @@ static int tracing_stat_init(void) struct dentry *d_tracing; d_tracing = tracing_init_dentry(); - if (!d_tracing) + if (IS_ERR(d_tracing)) return 0; stat_dir = debugfs_create_dir("trace_stat", d_tracing); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..5f0eba9e5e6b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1321,7 +1321,7 @@ static __init int init_uprobe_trace(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("uprobe_events", 0644, d_tracer, -- cgit v1.2.3 From e9d1b4f3c60997fe197bf0243cb4a41a44387a88 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 8 Jan 2015 14:30:22 -0800 Subject: x86, mpx: Strictly enforce empty prctl() args Description from Michael Kerrisk. He suggested an identical patch to one I had already coded up and tested. commit fe3d197f8431 "x86, mpx: On-demand kernel allocation of bounds tables" added two new prctl() operations, PR_MPX_ENABLE_MANAGEMENT and PR_MPX_DISABLE_MANAGEMENT. However, no checks were included to ensure that unused arguments are zero, as is done in many existing prctl()s and as should be done for all new prctl()s. This patch adds the required checks. Suggested-by: Andy Lutomirski Suggested-by: Michael Kerrisk Signed-off-by: Dave Hansen Cc: Dave Hansen Link: http://lkml.kernel.org/r/20150108223022.7F56FD13@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- kernel/sys.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a8c9f5a7dda6..ea9c88109894 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2210,9 +2210,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_ENABLE_MANAGEMENT(me); break; case PR_MPX_DISABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_DISABLE_MANAGEMENT(me); break; default: -- cgit v1.2.3 From 4bee96860a65c3a62d332edac331b3cf936ba3ad Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 31 Jul 2014 11:30:17 +0800 Subject: smpboot: Add missing get_online_cpus() in smpboot_register_percpu_thread() The following race exists in the smpboot percpu threads management: CPU0 CPU1 cpu_up(2) get_online_cpus(); smpboot_create_threads(2); smpboot_register_percpu_thread(); for_each_online_cpu(); __smpboot_create_thread(); __cpu_up(2); This results in a missing per cpu thread for the newly onlined cpu2 and in a NULL pointer dereference on a consecutive offline of that cpu. Proctect smpboot_register_percpu_thread() with get_online_cpus() to prevent that. [ tglx: Massaged changelog and removed the change in smpboot_unregister_percpu_thread() because that's an optimization and therefor not stable material. ] Signed-off-by: Lai Jiangshan Cc: Thomas Gleixner Cc: Rusty Russell Cc: Peter Zijlstra Cc: Srivatsa S. Bhat Cc: David Rientjes Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1406777421-12830-1-git-send-email-laijs@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f032fb5284e3..40190f28db35 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); @@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) list_add(&plug_thread->list, &hotplug_threads); out: mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); return ret; } EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); -- cgit v1.2.3 From 9bc7491906b4113b4c5ae442157c7dfc4e10cd14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jan 2015 21:24:10 +0100 Subject: hrtimer: Prevent stale expiry time in hrtimer_interrupt() hrtimer_interrupt() has the following subtle issue: hrtimer_interrupt() lock(cpu_base); expires_next = KTIME_MAX; expire_timers(CLOCK_MONOTONIC); expires = get_next_timer(CLOCK_MONOTONIC); if (expires < expires_next) expires_next = expires; expire_timers(CLOCK_REALTIME); unlock(cpu_base); wakeup() hrtimer_start(CLOCK_MONOTONIC, newtimer); lock(cpu_base(); expires = get_next_timer(CLOCK_REALTIME); if (expires < expires_next) expires_next = expires; So because we already evaluated the next expiring timer of CLOCK_MONOTONIC we ignore that the expiry time of newtimer might be earlier than the overall next expiry time in hrtimer_interrupt(). To solve this, remove the caching of the next expiry value from hrtimer_interrupt() and reevaluate all active clock bases for the next expiry value. To avoid another code duplication, create a shared evaluation function and use it for hrtimer_get_next_event(), hrtimer_force_reprogram() and hrtimer_interrupt(). There is another subtlety in this mechanism: While hrtimer_interrupt() is running, we want to avoid to touch the hardware device because we will reprogram it anyway at the end of hrtimer_interrupt(). This works nicely for hrtimers which get rearmed via the HRTIMER_RESTART mechanism, because we drop out when the callback on that CPU is running. But that fails, if a new timer gets enqueued like in the example above. This has another implication: While hrtimer_interrupt() is running we refuse remote enqueueing of timers - see hrtimer_interrupt() and hrtimer_check_target(). hrtimer_interrupt() tries to prevent this by setting cpu_base->expires to KTIME_MAX, but that fails if a new timer gets queued. Prevent both the hardware access and the remote enqueue explicitely. We can loosen the restriction on the remote enqueue now due to reevaluation of the next expiry value, but that needs a seperate patch. Folded in a fix from Vignesh Radhakrishnan. Reported-and-tested-by: Stanislav Fomichev Based-on-patch-by: Stanislav Fomichev Signed-off-by: Thomas Gleixner Cc: vigneshr@codeaurora.org Cc: john.stultz@linaro.org Cc: viresh.kumar@linaro.org Cc: fweisbec@gmail.com Cc: cl@linux.com Cc: stuart.w.hayes@gmail.com Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1501202049190.5526@nanos Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 + kernel/time/hrtimer.c | 108 ++++++++++++++++++++++-------------------------- 2 files changed, 52 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a036d058a249..05f6df1fdf5b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -170,6 +170,7 @@ enum hrtimer_base_type { * @clock_was_set: Indicates that clock was set from irq context. * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() + * @in_hrtirq: hrtimer_interrupt() is currently executing * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang * @nr_events: Total number of hrtimer interrupt events @@ -185,6 +186,7 @@ struct hrtimer_cpu_base { unsigned int clock_was_set; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; + int in_hrtirq; int hres_active; int hang_detected; unsigned long nr_events; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..b663653a5d5b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next.tv64 < 0) + expires_next.tv64 = 0; + return expires_next; +} +#endif + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative. Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -586,6 +592,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 >= cpu_base->expires_next.tv64) return 0; + /* + * When the target cpu of the timer is currently executing + * hrtimer_interrupt(), then we do not touch the clock event + * device. hrtimer_interrupt() will reevaluate all clock bases + * before reprogramming the device. + */ + if (cpu_base->in_hrtirq) + return 0; + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry @@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + ktime_t mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; - int i; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } + if (!hrtimer_hres_active()) + mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), + ktime_get()); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock(&cpu_base->lock); entry_time = now = hrtimer_update_base(cpu_base); retry: - expires_next.tv64 = KTIME_MAX; + cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1291,28 +1291,20 @@ retry: * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < 0) - expires.tv64 = KTIME_MAX; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - } __run_hrtimer(timer, &basenow); } } - + /* Reevaluate the clock bases for the next expiry */ + expires_next = __hrtimer_get_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ -- cgit v1.2.3 From 89f703f0932341b316b2312581dacddba14b3876 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Jan 2015 22:24:00 +0100 Subject: X.509: shut up about included cert for silent build Every kernel build that includes X.509 support prints out a message like - Including cert signing_key.x509 This may be useful for some cases, but when doing automated build tests, it just means noise. To hide the message, this uses '$(kecho)' for printing the message, which means we still see it when building with V=1, but not at the normal level or when building with 'make -s'. Signed-off-by: Arnd Bergmann Signed-off-by: David Howells --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..23e17a7e7a63 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -142,7 +142,7 @@ endif kernel/system_certificates.o: $(obj)/x509_certificate_list quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") targets += $(obj)/x509_certificate_list $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list -- cgit v1.2.3 From f5f4eda4c9b9e26ec7d7c8ce083e386016ffc0f8 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Fri, 5 Dec 2014 11:19:08 -0600 Subject: PM / QoS: Add debugfs support to view the list of constraints PM QoS requests are notoriously hard to debug and made even more so due to their highly dynamic nature. Having visibility into the internal data representation per constraint allows us to have much better appreciation of potential issues or bad usage by drivers in the system. So introduce for all classes of PM QoS, an entry in /sys/kernel/debug/pm_qos that shall show all the current requests as well as the snapshot of the value these requests boil down to. For example: ==> /sys/kernel/debug/pm_qos/cpu_dma_latency <== 1: 4444: Active 2: 2000000000: Default 3: 2000000000: Default 4: 2000000000: Default Type=Minimum, Value=4444, Requests: active=1 / total=4 ==> /sys/kernel/debug/pm_qos/memory_bandwidth <== Empty! ... The actual value listed will have their meaning based on the QoS it is on, the 'Type' indicates what logic it would use to collate the information - Minimum, Maximum, or Sum. Value is the collation of all requests. This interface also compares the values with the defaults for the QoS class and marks the ones that are currently active. Signed-off-by: Nishanth Menon Signed-off-by: Dave Gerlach Acked-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 5f4c006c4b1e..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include @@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } +static inline int pm_qos_get_value(struct pm_qos_constraints *c); +static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) +{ + struct pm_qos_object *qos = (struct pm_qos_object *)s->private; + struct pm_qos_constraints *c; + struct pm_qos_request *req; + char *type; + unsigned long flags; + int tot_reqs = 0; + int active_reqs = 0; + + if (IS_ERR_OR_NULL(qos)) { + pr_err("%s: bad qos param!\n", __func__); + return -EINVAL; + } + c = qos->constraints; + if (IS_ERR_OR_NULL(c)) { + pr_err("%s: Bad constraints on qos?\n", __func__); + return -EINVAL; + } + + /* Lock to ensure we have a snapshot */ + spin_lock_irqsave(&pm_qos_lock, flags); + if (plist_head_empty(&c->list)) { + seq_puts(s, "Empty!\n"); + goto out; + } + + switch (c->type) { + case PM_QOS_MIN: + type = "Minimum"; + break; + case PM_QOS_MAX: + type = "Maximum"; + break; + case PM_QOS_SUM: + type = "Sum"; + break; + default: + type = "Unknown"; + } + + plist_for_each_entry(req, &c->list, node) { + char *state = "Default"; + + if ((req->node).prio != c->default_value) { + active_reqs++; + state = "Active"; + } + tot_reqs++; + seq_printf(s, "%d: %d: %s\n", tot_reqs, + (req->node).prio, state); + } + + seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", + type, pm_qos_get_value(c), active_reqs, tot_reqs); + +out: + spin_unlock_irqrestore(&pm_qos_lock, flags); + return 0; +} + +static int pm_qos_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, pm_qos_dbg_show_requests, + inode->i_private); +} + +static const struct file_operations pm_qos_debug_fops = { + .open = pm_qos_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /** * pm_qos_update_target - manages the constraints list and calls the notifiers * if needed @@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); /* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos) +static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) { qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; qos->pm_qos_power_miscdev.name = qos->name; qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + if (d) { + (void)debugfs_create_file(qos->name, S_IRUGO, d, + (void *)qos, &pm_qos_debug_fops); + } + return misc_register(&qos->pm_qos_power_miscdev); } @@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void) { int ret = 0; int i; + struct dentry *d; BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + d = debugfs_create_dir("pm_qos", NULL); + if (IS_ERR_OR_NULL(d)) + d = NULL; + for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { - ret = register_pm_qos_misc(pm_qos_array[i]); + ret = register_pm_qos_misc(pm_qos_array[i], d); if (ret < 0) { printk(KERN_ERR "pm_qos_param: %s setup failed\n", pm_qos_array[i]->name); -- cgit v1.2.3 From d78cb3680c8e95c100d5a351b8cfc7dd3c589b68 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Sun, 11 Jan 2015 23:27:38 +0100 Subject: PM / hibernate: Remove unused function Remove the function get_safe_write_buffer() that is not used anywhere. This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0c40c16174b4..8e75a5a1e9b4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2310,8 +2310,6 @@ static inline void free_highmem_data(void) free_image_page(buffer, PG_UNSAFE_CLEAR); } #else -static inline int get_safe_write_buffer(void) { return 0; } - static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } -- cgit v1.2.3 From 8b618628b2bf83512fc8df5e8672619d65adfdfb Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 3 Dec 2014 14:43:06 -0500 Subject: ktime: Optimize ktime_divns for constant divisors At least on ARM, do_div() is optimized to turn constant divisors into an inline multiplication by the reciprocal value at compile time. However this optimization is missed entirely whenever ktime_divns() is used and the slow out-of-line division code is used all the time. Let ktime_divns() use do_div() inline whenever the divisor is constant and small enough. This will make things like ktime_to_us() and ktime_to_ms() much faster. Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Nicolas Pitre Acked-by: Arnd Bergmann Signed-off-by: Nicolas Pitre Signed-off-by: John Stultz --- include/linux/ktime.h | 12 +++++++++++- kernel/time/hrtimer.c | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c9d645ad98ff..411dd8bfe539 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -166,7 +166,17 @@ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) } #if BITS_PER_LONG < 64 -extern u64 ktime_divns(const ktime_t kt, s64 div); +extern u64 __ktime_divns(const ktime_t kt, s64 div); +static inline u64 ktime_divns(const ktime_t kt, s64 div) +{ + if (__builtin_constant_p(div) && !(div >> 32)) { + u64 ns = kt.tv64; + do_div(ns, div); + return ns; + } else { + return __ktime_divns(kt, div); + } +} #else /* BITS_PER_LONG < 64 */ # define ktime_divns(kt, div) (u64)((kt).tv64 / (div)) #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..890535c41c2d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 ktime_divns(const ktime_t kt, s64 div) +u64 __ktime_divns(const ktime_t kt, s64 div) { u64 dclc; int sft = 0; @@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } -EXPORT_SYMBOL_GPL(ktime_divns); +EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* -- cgit v1.2.3 From d08c0cdd26d48751c15aa2b4479a410594fee9ac Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 8 Dec 2014 12:00:09 -0800 Subject: time: Expose getboottime64 for in-kernel uses Adds a timespec64 based getboottime64() implementation that can be used as we convert internal users of getboottime away from using timespecs. Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: John Stultz --- include/linux/timekeeping.h | 16 ++++++++++++++-- kernel/time/timekeeping.c | 12 ++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 9b63d13ba82b..91480137aa39 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -33,6 +33,7 @@ extern time64_t ktime_get_real_seconds(void); extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); +extern void getboottime64(struct timespec64 *ts); #if BITS_PER_LONG == 64 /** @@ -72,6 +73,11 @@ static inline struct timespec get_monotonic_coarse(void) { return get_monotonic_coarse64(); } + +static inline void getboottime(struct timespec *ts) +{ + return getboottime64(ts); +} #else /** * Deprecated. Use do_settimeofday64(). @@ -129,9 +135,15 @@ static inline struct timespec get_monotonic_coarse(void) { return timespec64_to_timespec(get_monotonic_coarse64()); } -#endif -extern void getboottime(struct timespec *ts); +static inline void getboottime(struct timespec *ts) +{ + struct timespec64 ts64; + + getboottime64(&ts64); + *ts = timespec64_to_timespec(ts64); +} +#endif #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) #define ktime_get_real_ts64(ts) getnstimeofday64(ts) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a931852082f..b124af259800 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1659,24 +1659,24 @@ out: } /** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set * - * Returns the wall-time of boot in a timespec. + * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ -void getboottime(struct timespec *ts) +void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - *ts = ktime_to_timespec(t); + *ts = ktime_to_timespec64(t); } -EXPORT_SYMBOL_GPL(getboottime); +EXPORT_SYMBOL_GPL(getboottime64); unsigned long get_seconds(void) { -- cgit v1.2.3 From 9a4a445e30f0b601ca2d9433274047cbf48ebf9e Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 22 Jan 2015 02:31:55 +0000 Subject: rtc: Convert rtc_set_ntp_time() to use timespec64 rtc_set_ntp_time() uses timespec which is y2038-unsafe, so modify to use timespec64 which is y2038-safe, then replace rtc_time_to_tm() with rtc_time64_to_tm(). Also adjust all its call sites(only NTP uses it) accordingly. Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Xunlei Pang Signed-off-by: John Stultz --- drivers/rtc/systohc.c | 6 +++--- include/linux/rtc.h | 2 +- kernel/time/ntp.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c index bf3e242ccc5c..eb71872d0361 100644 --- a/drivers/rtc/systohc.c +++ b/drivers/rtc/systohc.c @@ -20,16 +20,16 @@ * * If temporary failure is indicated the caller should try again 'soon' */ -int rtc_set_ntp_time(struct timespec now) +int rtc_set_ntp_time(struct timespec64 now) { struct rtc_device *rtc; struct rtc_time tm; int err = -ENODEV; if (now.tv_nsec < (NSEC_PER_SEC >> 1)) - rtc_time_to_tm(now.tv_sec, &tm); + rtc_time64_to_tm(now.tv_sec, &tm); else - rtc_time_to_tm(now.tv_sec + 1, &tm); + rtc_time64_to_tm(now.tv_sec + 1, &tm); rtc = rtc_class_open(CONFIG_RTC_HCTOSYS_DEVICE); if (rtc) { diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 6d6be09a2fe5..dcad7ee0d746 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -161,7 +161,7 @@ extern void devm_rtc_device_unregister(struct device *dev, extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); -extern int rtc_set_ntp_time(struct timespec now); +extern int rtc_set_ntp_time(struct timespec64 now); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..183dfe2191c6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work) getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = timespec64_to_timespec(now); + struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(adjust); + fail = update_persistent_clock(timespec64_to_timespec(adjust)); #endif #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) -- cgit v1.2.3 From 4ebbda5251374d532ba8939de4241d769d1420b6 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 23 Jan 2015 20:12:06 +0800 Subject: hrtimer: Make __hrtimer_get_next_event() static kernel/time/hrtimer.c:444:9: sparse: symbol '__hrtimer_get_next_event' was not declared. Should it be static? Fixes: 9bc7491906b4 hrtimer: Prevent stale expiry time in hrtimer_interrupt() Signed-off-by: Fengguang Wu Cc: kbuild-all@01.org Link: http://lkml.kernel.org/r/20150123121206.GA4766@snb Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b67ebeaa447b..ce8221ac5f39 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -441,7 +441,7 @@ static inline void debug_deactivate(struct hrtimer *timer) } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; -- cgit v1.2.3 From edb0ec0725bb9797ded0deaf172a6795e43795c8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 25 Jan 2015 19:50:34 +0100 Subject: kexec, Kconfig: spell "architecture" properly Grepping for "archicture" showed it actually twice! Most unusual spelling error, very interesting. :) Signed-off-by: Borislav Petkov Signed-off-by: Jiri Kosina --- kernel/kexec.c | 2 +- lib/Kconfig.debug | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a61..b2ed6a3d99fb 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -2512,7 +2512,7 @@ static int kexec_apply_relocations(struct kimage *image) continue; /* - * Respective archicture needs to provide support for applying + * Respective architecture needs to provide support for applying * relocations of type SHT_RELA/SHT_REL. */ if (sechdrs[i].sh_type == SHT_RELA) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4e35a5d767ed..cfe8ca6717c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -620,7 +620,7 @@ config DEBUG_STACKOVERFLOW depends on DEBUG_KERNEL && HAVE_DEBUG_STACKOVERFLOW ---help--- Say Y here if you want to check for overflows of kernel, IRQ - and exception stacks (if your archicture uses them). This + and exception stacks (if your architecture uses them). This option will show detailed messages if free stack space drops below a certain limit. -- cgit v1.2.3 From 8ebe667c41e054384df19f2f382bc415badfaee1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Jan 2015 17:11:08 -0800 Subject: bpf: rcu lock must not be held when calling copy_to_user() BUG: sleeping function called from invalid context at mm/memory.c:3732 in_atomic(): 0, irqs_disabled(): 0, pid: 671, name: test_maps 1 lock held by test_maps/671: #0: (rcu_read_lock){......}, at: [<0000000000264190>] map_lookup_elem+0xe8/0x260 Call Trace: ([<0000000000115b7e>] show_trace+0x12e/0x150) [<0000000000115c40>] show_stack+0xa0/0x100 [<00000000009b163c>] dump_stack+0x74/0xc8 [<000000000017424a>] ___might_sleep+0x23a/0x248 [<00000000002b58e8>] might_fault+0x70/0xe8 [<0000000000264230>] map_lookup_elem+0x188/0x260 [<0000000000264716>] SyS_bpf+0x20e/0x840 Fix it by allocating temporary buffer to store map element value. Fixes: db20fd2b0108 ("bpf: add lookup/update/delete/iterate methods to BPF maps") Reported-by: Michael Holzheu Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 088ac0b1b106..536edc2be307 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct fd f = fdget(ufd); struct bpf_map *map; - void *key, *value; + void *key, *value, *ptr; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) @@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; - err = -ENOENT; - rcu_read_lock(); - value = map->ops->map_lookup_elem(map, key); + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); if (!value) - goto err_unlock; + goto free_key; + + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, map->value_size); + rcu_read_unlock(); + + err = -ENOENT; + if (!ptr) + goto free_value; err = -EFAULT; if (copy_to_user(uvalue, value, map->value_size) != 0) - goto err_unlock; + goto free_value; err = 0; -err_unlock: - rcu_read_unlock(); +free_value: + kfree(value); free_key: kfree(key); err_put: -- cgit v1.2.3 From 69a1c994cc540cc84469acf9952f72b899b38e8b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 27 Jan 2015 17:17:20 +0100 Subject: tracing: Remove newline from trace_printk warning banner Remove the output-confusing newline below: [ 0.191328] ********************************************************** [ 0.191493] ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** [ 0.191586] ** ** ... Link: http://lkml.kernel.org/r/1422375440-31970-1-git-send-email-bp@alien8.de Signed-off-by: Borislav Petkov [ added an extra '\n' by itself, to keep what it was suppose to do ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index acd27555dc5b..f82e53b0e5a7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2036,7 +2036,8 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n**********************************************************\n"); + pr_warning("\n"); + pr_warning("**********************************************************\n"); pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warning("** **\n"); pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); -- cgit v1.2.3 From 81907478c4311a679849216abf723999184ab984 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Jan 2015 08:25:38 +0000 Subject: sched/fair: Avoid using uninitialized variable in preferred_group_nid() At least some gcc versions - validly afaict - warn about potentially using max_group uninitialized: There's no way the compiler can prove that the body of the conditional where it and max_faults get set/ updated gets executed; in fact, without knowing all the details of other scheduler code, I can't prove this either. Generally the necessary change would appear to be to clear max_group prior to entering the inner loop, and break out of the outer loop when it ends up being all clear after the inner one. This, however, seems inefficient, and afaict the same effect can be achieved by exiting the outer loop when max_faults is still zero after the inner loop. [ mingo: changed the solution to zero initialization: uninitialized_var() needs to die, as it's an actively dangerous construct: if in the future a known-proven-good piece of code is changed to have a true, buggy uninitialized variable, the compiler warning is then supressed... The better long term solution is to clean up the code flow, so that even simple minded compilers (and humans!) are able to read it without getting a headache. ] Signed-off-by: Jan Beulich Signed-off-by: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/54C2139202000078000588F7@mail.emea.novell.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40667cbf371b..fe331fc391f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1730,7 +1730,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) nodes = node_online_map; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; - nodemask_t max_group; + nodemask_t max_group = NODE_MASK_NONE; int a, b; /* Are there nodes at this distance from each other? */ -- cgit v1.2.3 From c3c87e770458aa004bd7ed3f29945ff436fd6511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Jan 2015 11:19:48 +0100 Subject: perf: Tighten (and fix) the grouping condition The fix from 9fc81d87420d ("perf: Fix events installation during moving group") was incomplete in that it failed to recognise that creating a group with events for different CPUs is semantically broken -- they cannot be co-scheduled. Furthermore, it leads to real breakage where, when we create an event for CPU Y and then migrate it to form a group on CPU X, the code gets confused where the counter is programmed -- triggered in practice as well by me via the perf fuzzer. Fix this by tightening the rules for creating groups. Only allow grouping of counters that can be co-scheduled in the same context. This means for the same task and/or the same cpu. Fixes: 9fc81d87420d ("perf: Fix events installation during moving group") Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150123125834.090683288@infradead.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ------ kernel/events/core.c | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..664de5a4ec46 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -450,11 +450,6 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; -enum perf_event_context_type { - task_context, - cpu_context, -}; - /** * struct perf_event_context - event context structure * @@ -462,7 +457,6 @@ enum perf_event_context_type { */ struct perf_event_context { struct pmu *pmu; - enum perf_event_context_type type; /* * Protect the states of the events in the list, * nr_active, and the list: diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..19efcf13375a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6776,7 +6776,6 @@ skip_type: __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; __perf_cpu_hrtimer_init(cpuctx, cpu); @@ -7420,7 +7419,19 @@ SYSCALL_DEFINE5(perf_event_open, * task or CPU context: */ if (move_group) { - if (group_leader->ctx->type != ctx->type) + /* + * Make sure we're both on the same task, or both + * per-cpu events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. + */ + if (group_leader->cpu != event->cpu) goto err_context; } else { if (group_leader->ctx != ctx) -- cgit v1.2.3 From bb2bc55a694d45cdeda91b6f28ab2adec28125ef Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 28 Jan 2015 04:53:55 +0100 Subject: sched: Fix crash if cpuset_cpumask_can_shrink() is passed an empty cpumask While creating an exclusive cpuset, we passed cpuset_cpumask_can_shrink() an empty cpumask (cur), and dl_bw_of(cpumask_any(cur)) made boom with it: CPU: 0 PID: 6942 Comm: shield.sh Not tainted 3.19.0-master #19 Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007 task: ffff880224552450 ti: ffff8800caab8000 task.ti: ffff8800caab8000 RIP: 0010:[] [] cpuset_cpumask_can_shrink+0x56/0xb0 [...] Call Trace: [] validate_change+0x18a/0x200 [] cpuset_write_resmask+0x3b7/0x720 [] cgroup_file_write+0x38/0x100 [] kernfs_fop_write+0x12a/0x180 [] vfs_write+0xb3/0x1d0 [] SyS_write+0x46/0xb0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Mike Galbraith Acked-by: Zefan Li Fixes: f82f80426f7a ("sched/deadline: Ensure that updates to exclusive cpusets don't break AC") Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422417235.5716.5.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..5c86687d22b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4642,6 +4642,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, struct dl_bw *cur_dl_b; unsigned long flags; + if (!cpumask_weight(cur)) + return ret; + rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); trial_cpus = cpumask_weight(trial); -- cgit v1.2.3 From 6ea22486ba46bcb665de36514094d74575cd1330 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 28 Jan 2015 12:48:53 +0000 Subject: tracing: Add array printing helper If a trace event contains an array, there is currently no standard way to format this for text output. Drivers are currently hacking around this by a) local hacks that use the trace_seq functionailty directly, or b) just not printing that information. For fixed size arrays, formatting of the elements can be open-coded, but this gets cumbersome for arrays of non-trivial size. These approaches result in non-standard content of the event format description delivered to userspace, so userland tools needs to be taught to understand and parse each array printing method individually. This patch implements a __print_array() helper that tracepoint implementations can use instead of reinventing it. A simple C-style syntax is used to delimit the array and its elements {like,this}. So that the helpers can be used with large static arrays as well as dynamic arrays, they take a pointer and element count: they can be used with __get_dynamic_array() for use with dynamic arrays. Link: http://lkml.kernel.org/r/1422449335-8289-2-git-send-email-javi.merino@arm.com Cc: Ingo Molnar Signed-off-by: Dave Martin Signed-off-by: Javi Merino Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ++++ include/trace/ftrace.h | 9 +++++++++ kernel/trace/trace_output.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c348b8..5aa4a9269547 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -44,6 +44,10 @@ const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); +const char *ftrace_print_array_seq(struct trace_seq *p, + const void *buf, int buf_len, + size_t el_size); + struct trace_iterator; struct trace_event; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b5067345b..304901fc5f34 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -263,6 +263,14 @@ #undef __print_hex #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len) +#undef __print_array +#define __print_array(array, count, el_size) \ + ({ \ + BUILD_BUG_ON(el_size != 1 && el_size != 2 && \ + el_size != 4 && el_size != 8); \ + ftrace_print_array_seq(p, array, count, el_size); \ + }) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ @@ -674,6 +682,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask +#undef __print_array #undef TP_printk #define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b77b9a697619..692bf7184c8c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +const char * +ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len, + size_t el_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + const char *prefix = ""; + void *ptr = (void *)buf; + + trace_seq_putc(p, '{'); + + while (ptr < buf + buf_len) { + switch (el_size) { + case 1: + trace_seq_printf(p, "%s0x%x", prefix, + *(u8 *)ptr); + break; + case 2: + trace_seq_printf(p, "%s0x%x", prefix, + *(u16 *)ptr); + break; + case 4: + trace_seq_printf(p, "%s0x%x", prefix, + *(u32 *)ptr); + break; + case 8: + trace_seq_printf(p, "%s0x%llx", prefix, + *(u64 *)ptr); + break; + default: + trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, + *(u8 *)ptr); + el_size = 1; + } + prefix = ","; + ptr += el_size; + } + + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_array_seq); + int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { -- cgit v1.2.3 From da194930ede6d87945786f72b1c36c7a4ed0f3a6 Mon Sep 17 00:00:00 2001 From: Tina Ruchandani Date: Wed, 28 Jan 2015 19:46:11 +0530 Subject: trace: Use 64-bit timekeeping The ring_buffer_producer uses 'struct timeval' to measure its start and end times. 'struct timeval' on 32-bit systems will have its tv_sec value overflow in year 2038 and beyond. This patch replaces struct timeval with 'ktime_t' which uses 64-bit representation for nanoseconds. Link: http://lkml.kernel.org/r/20150128141611.GA2701@tinar Suggested-by: Arnd Bergmann Suggested-by: Steven Rostedt Signed-off-by: Tina Ruchandani Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 3f9e328c30b5..13d945c0d03f 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include struct rb_page { @@ -17,7 +17,7 @@ struct rb_page { }; /* run time and sleep time in seconds */ -#define RUN_TIME 10 +#define RUN_TIME 10ULL #define SLEEP_TIME 10 /* number of events for writer to wake up the reader */ @@ -212,8 +212,7 @@ static void ring_buffer_consumer(void) static void ring_buffer_producer(void) { - struct timeval start_tv; - struct timeval end_tv; + ktime_t start_time, end_time, timeout; unsigned long long time; unsigned long long entries; unsigned long long overruns; @@ -227,7 +226,8 @@ static void ring_buffer_producer(void) * make the system stall) */ trace_printk("Starting ring buffer hammer\n"); - do_gettimeofday(&start_tv); + start_time = ktime_get(); + timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC); do { struct ring_buffer_event *event; int *entry; @@ -244,7 +244,7 @@ static void ring_buffer_producer(void) ring_buffer_unlock_commit(buffer, event); } } - do_gettimeofday(&end_tv); + end_time = ktime_get(); cnt++; if (consumer && !(cnt % wakeup_interval)) @@ -264,7 +264,7 @@ static void ring_buffer_producer(void) cond_resched(); #endif - } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + } while (ktime_before(end_time, timeout) && !kill_test); trace_printk("End ring buffer hammer\n"); if (consumer) { @@ -280,9 +280,7 @@ static void ring_buffer_producer(void) wait_for_completion(&read_done); } - time = end_tv.tv_sec - start_tv.tv_sec; - time *= USEC_PER_SEC; - time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + time = ktime_us_delta(end_time, start_time); entries = ring_buffer_entries(buffer); overruns = ring_buffer_overruns(buffer); -- cgit v1.2.3 From c0a80c0c27e5e65b180a25e6c4c2f7ef9e386cd3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 9 Jan 2015 13:06:33 +0100 Subject: ftrace: allow architectures to specify ftrace compile options If the kernel is compiled with function tracer support the -pg compile option is passed to gcc to generate extra code into the prologue of each function. This patch replaces the "open-coded" -pg compile flag with a CC_FLAGS_FTRACE makefile variable which architectures can override if a different option should be used for code generation. Acked-by: Steven Rostedt Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- Makefile | 6 +++++- kernel/Makefile | 4 ++-- kernel/events/Makefile | 2 +- kernel/locking/Makefile | 8 ++++---- kernel/sched/Makefile | 2 +- kernel/trace/Makefile | 4 ++-- lib/Makefile | 2 +- scripts/Makefile.build | 5 +++-- 8 files changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index fd80c6e9bc23..11c6fe8f708f 100644 --- a/Makefile +++ b/Makefile @@ -724,10 +724,14 @@ KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly) \ endif ifdef CONFIG_FUNCTION_TRACER +ifndef CC_FLAGS_FTRACE +CC_FLAGS_FTRACE := -pg +endif +export CC_FLAGS_FTRACE ifdef CONFIG_HAVE_FENTRY CC_USING_FENTRY := $(call cc-option, -mfentry -DCC_USING_FENTRY) endif -KBUILD_CFLAGS += -pg $(CC_USING_FENTRY) +KBUILD_CFLAGS += $(CC_FLAGS_FTRACE) $(CC_USING_FENTRY) KBUILD_AFLAGS += $(CC_USING_FENTRY) ifdef CONFIG_DYNAMIC_FTRACE ifdef CONFIG_HAVE_C_RECORDMCOUNT diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..13af308f2460 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \ ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = -pg -CFLAGS_REMOVE_irq_work.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif # cond_syscall is currently not LTO compatible diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 103f5d147b2f..2925188f50ea 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,5 +1,5 @@ ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = -pg +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) endif obj-y := core.o ring_buffer.o callchain.o diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..4caca3f7af53 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -2,10 +2,10 @@ obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ab32b7b0db5c..46be87024875 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,5 +1,5 @@ ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = -pg +CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 67d6369ddf83..9a7e0e60a1a8 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,11 +3,11 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation -CFLAGS_trace_selftest_dynamic.o = -pg +CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) obj-y += trace_selftest_dynamic.o endif endif diff --git a/lib/Makefile b/lib/Makefile index 3c3b30b9e020..f3f73e50519a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,7 +4,7 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 649ce6844033..01df30af4d4a 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -234,8 +234,9 @@ sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH "$(if $(part-of-module),1,0)" "$(@)"; recordmcount_source := $(srctree)/scripts/recordmcount.pl endif -cmd_record_mcount = \ - if [ "$(findstring -pg,$(_c_flags))" = "-pg" ]; then \ +cmd_record_mcount = \ + if [ "$(findstring $(CC_FLAGS_FTRACE),$(_c_flags))" = \ + "$(CC_FLAGS_FTRACE)" ]; then \ $(sub_cmd_record_mcount) \ fi; endif -- cgit v1.2.3 From c9257f78b4f785d200737d60453fff51b57a2356 Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Fri, 12 Dec 2014 20:06:46 -0800 Subject: PM / sleep: export suspend_resume trace event Export the suspend_resume tracepoint so it can be used in loadable modules. Signed-off-by: Todd Brandt Signed-off-by: Rafael J. Wysocki --- kernel/trace/power-traces.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 1c71382b283d..eb4220a132ec 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,5 +13,6 @@ #define CREATE_TRACE_POINTS #include +EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); -- cgit v1.2.3 From 80e3d87b2c5582db0ab5e39610ce3707d97ba409 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 12 Dec 2014 15:38:12 -0800 Subject: sched/rt: Reduce rq lock contention by eliminating locking of non-feasible target This patch adds checks that prevens futile attempts to move rt tasks to a CPU with active tasks of equal or higher priority. This reduces run queue lock contention and improves the performance of a well known OLTP benchmark by 0.7%. Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Cc: Shawn Bohrer Cc: Suruchi Kadu Cc: Doug Nelson Cc: Steven Rostedt Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1421430374.2399.27.camel@schen9-desk2.jf.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6725e3c49660..f4d4b077eba0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1340,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr->prio <= p->prio)) { int target = find_lowest_rq(p); - if (target != -1) + /* + * Don't bother moving it if the destination CPU is + * not running a lower priority task. + */ + if (target != -1 && + p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } rcu_read_unlock(); @@ -1617,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) lowest_rq = cpu_rq(cpu); + if (lowest_rq->rt.highest_prio.curr <= task->prio) { + /* + * Target rq has tasks of equal or higher priority, + * retrying does not release any lock and is unlikely + * to yield a different result. + */ + lowest_rq = NULL; + break; + } + /* if the prio of this runqueue changed, try again */ if (double_lock_balance(rq, lowest_rq)) { /* -- cgit v1.2.3 From a18b5d01819235629289212ad428a5ee2b40f0d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Jan 2015 18:08:04 +0100 Subject: sched: Fix missing preemption opportunity If an interrupt fires in cond_resched(), between the call to __schedule() and the PREEMPT_ACTIVE count decrementation, and that interrupt sets TIF_NEED_RESCHED, the call to preempt_schedule_irq() will be ignored due to the PREEMPT_ACTIVE count. This kind of scenario, with irq preemption being delayed because it's interrupting a preempt-disabled area, is usually fixed up after preemption is re-enabled back with an explicit call to preempt_schedule(). This is what preempt_enable() does but a raw preempt count decrement as performed by __preempt_count_sub(PREEMPT_ACTIVE) doesn't handle delayed preemption check. Therefore when such a race happens, the rescheduling is going to be delayed until the next scheduler or preemption entrypoint. This can be a problem for scheduler latency sensitive workloads. Lets fix that by consolidating cond_resched() with preempt_schedule() internals. Reported-by: Linus Torvalds Reported-by: Ingo Molnar Original-patch-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1421946484-9298-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b591fe67b70..54dce019c0ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2884,6 +2884,21 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +static void preempt_schedule_common(void) +{ + do { + __preempt_count_add(PREEMPT_ACTIVE); + __schedule(); + __preempt_count_sub(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); +} + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -2899,17 +2914,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; - do { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); + preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); @@ -4209,17 +4214,10 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static void __cond_resched(void) -{ - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); -} - int __sched _cond_resched(void) { if (should_resched()) { - __cond_resched(); + preempt_schedule_common(); return 1; } return 0; @@ -4244,7 +4242,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) - __cond_resched(); + preempt_schedule_common(); else cpu_relax(); ret = 1; @@ -4260,7 +4258,7 @@ int __sched __cond_resched_softirq(void) if (should_resched()) { local_bh_enable(); - __cond_resched(); + preempt_schedule_common(); local_bh_disable(); return 1; } -- cgit v1.2.3 From ff6f2d29bd31cdfa1ac494a8b26d2af8ba887d59 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Wed, 21 Jan 2015 16:27:25 +0530 Subject: sched/idle: Add missing checks to the exit condition of cpu_idle_poll() cpu_idle_poll() is entered into when either the cpu_idle_force_poll is set or tick_check_broadcast_expired() returns true. The exit condition from cpu_idle_poll() is tif_need_resched(). However this does not take into account scenarios where cpu_idle_force_poll changes or tick_check_broadcast_expired() returns false, without setting the resched flag. So a cpu will be caught in cpu_idle_poll() needlessly, thereby wasting power. Add an explicit check on cpu_idle_force_poll and tick_check_broadcast_expired() to the exit condition of cpu_idle_poll() to avoid this. Signed-off-by: Preeti U Murthy Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: linuxppc-dev@lists.ozlabs.org Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150121105655.15279.59626.stgit@preeti.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c47fce75e666..aaf1c1d5cf5d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!tif_need_resched()) + while (!tif_need_resched() && + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); -- cgit v1.2.3 From 16b269436b7213ebc01dcfcc9dafa8535b676ccb Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 19 Jan 2015 04:49:36 +0000 Subject: sched/deadline: Modify cpudl::free_cpus to reflect rd->online Currently, cpudl::free_cpus contains all CPUs during init, see cpudl_init(). When calling cpudl_find(), we have to add rd->span to avoid selecting the cpu outside the current root domain, because cpus_allowed cannot be depended on when performing clustered scheduling using the cpuset, see find_later_rq(). This patch adds cpudl_set_freecpu() and cpudl_clear_freecpu() for changing cpudl::free_cpus when doing rq_online_dl()/rq_offline_dl(), so we can avoid the rd->span operation when calling cpudl_find() in find_later_rq(). Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1421642980-10045-1-git-send-email-pang.xunlei@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 28 ++++++++++++++++++++++++---- kernel/sched/cpudeadline.h | 2 ++ kernel/sched/deadline.c | 5 ++--- 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 539ca3ce071b..fd9d3fb49d0d 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,7 +107,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { + if (later_mask && + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed) && + cpumask_and(later_mask, later_mask, cpu_active_mask)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && @@ -185,6 +187,26 @@ out: raw_spin_unlock_irqrestore(&cp->lock, flags); } +/* + * cpudl_set_freecpu - Set the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_set_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_set_cpu(cpu, cp->free_cpus); +} + +/* + * cpudl_clear_freecpu - Clear the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_clear_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_clear_cpu(cpu, cp->free_cpus); +} + /* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context @@ -203,7 +225,7 @@ int cpudl_init(struct cpudl *cp) if (!cp->elements) return -ENOMEM; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { kfree(cp->elements); return -ENOMEM; } @@ -211,8 +233,6 @@ int cpudl_init(struct cpudl *cp) for_each_possible_cpu(i) cp->elements[i].idx = IDX_INVALID; - cpumask_setall(cp->free_cpus); - return 0; } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 020039bd1326..1a0a6ef2fbe1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); #endif /* CONFIG_SMP */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b52092f2636d..e7b272233c5c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1165,9 +1165,6 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. */ - cpumask_copy(later_mask, task_rq(task)->rd->span); - cpumask_and(later_mask, later_mask, cpu_active_mask); - cpumask_and(later_mask, later_mask, &task->cpus_allowed); best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) @@ -1562,6 +1559,7 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); + cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } @@ -1573,6 +1571,7 @@ static void rq_offline_dl(struct rq *rq) dl_clear_overload(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } void init_sched_dl_class(void) -- cgit v1.2.3 From 00845eb968ead28007338b2bb852b8beef816583 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Feb 2015 12:23:32 -0800 Subject: sched: don't cause task state changes in nested sleep debugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8eb23b9f35aa ("sched: Debug nested sleeps") added code to report on nested sleep conditions, which we generally want to avoid because the inner sleeping operation can re-set the thread state to TASK_RUNNING, but that will then cause the outer sleep loop not actually sleep when it calls schedule. However, that's actually valid traditional behavior, with the inner sleep being some fairly rare case (like taking a sleeping lock that normally doesn't actually need to sleep). And the debug code would actually change the state of the task to TASK_RUNNING internally, which makes that kind of traditional and working code not work at all, because now the nested sleep doesn't just sometimes cause the outer one to not block, but will cause it to happen every time. In particular, it will cause the cardbus kernel daemon (pccardd) to basically busy-loop doing scheduling, converting a laptop into a heater, as reported by Bruno Prémont. But there may be other legacy uses of that nested sleep model in other drivers that are also likely to never get converted to the new model. This fixes both cases: - don't set TASK_RUNNING when the nested condition happens (note: even if WARN_ONCE() only _warns_ once, the return value isn't whether the warning happened, but whether the condition for the warning was true. So despite the warning only happening once, the "if (WARN_ON(..))" would trigger for every nested sleep. - in the cases where we knowingly disable the warning by using "sched_annotate_sleep()", don't change the task state (that is used for all core scheduling decisions), instead use '->task_state_change' that is used for the debugging decision itself. (Credit for the second part of the fix goes to Oleg Nesterov: "Can't we avoid this subtle change in behaviour DEBUG_ATOMIC_SLEEP adds?" with the suggested change to use 'task_state_change' as part of the test) Reported-and-bisected-by: Bruno Prémont Tested-by: Rafael J Wysocki Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner , Cc: Ilya Dryomov , Cc: Mike Galbraith Cc: Ingo Molnar Cc: Peter Hurley , Cc: Davidlohr Bueso , Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- kernel/sched/core.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5449d2f4a1ef..64ce58bee6f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -176,7 +176,7 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) -# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) +# define sched_annotate_sleep() (current->task_state_change = 0) #else static inline void ___might_sleep(const char *file, int line, int preempt_offset) { } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..e628cb11b560 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7292,13 +7292,12 @@ void __might_sleep(const char *file, int line, int preempt_offset) * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - if (WARN_ONCE(current->state != TASK_RUNNING, + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " "state=%lx set at [<%p>] %pS\n", current->state, (void *)current->task_state_change, - (void *)current->task_state_change)) - __set_current_state(TASK_RUNNING); + (void *)current->task_state_change); ___might_sleep(file, line, preempt_offset); } -- cgit v1.2.3 From fd979c0132074856975a6e79bc2226b99435ec5b Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 30 Jan 2015 13:45:57 -0800 Subject: perf: provide sysfs_show for struct perf_pmu_events_attr (struct perf_pmu_events_attr) is defined in include/linux/perf_event.h, but the only "show" for it is in x86 and contains x86 specific stuff. Make a generic one for those of us who are just using the event_str. Signed-off-by: Cody P Schafer Signed-off-by: Sukadev Bhattiprolu Acked-by: Jiri Olsa Signed-off-by: Michael Ellerman --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 486e84ccb1f9..58f59bdb590b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -897,6 +897,9 @@ struct perf_pmu_events_attr { const char *event_str; }; +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c1ee7f2bebc..934687f8d51b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8276,6 +8276,18 @@ void __init perf_event_init(void) != 1024); } +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + if (pmu_attr->event_str) + return sprintf(page, "%s\n", pmu_attr->event_str); + + return 0; +} + static int __init perf_event_sysfs_init(void) { struct pmu *pmu; -- cgit v1.2.3 From c602894814adc93589dde028182101818c5f938b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 26 Jan 2015 20:38:39 -0500 Subject: tracing: Make tracing_init_dentry_tr() static tracing_init_dentry_tr() is not used outside of trace.c, it should be static. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f82e53b0e5a7..2668a0d742ee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5815,7 +5815,7 @@ static __init int register_snapshot_cmd(void) static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +static struct dentry *tracing_init_dentry_tr(struct trace_array *tr) { if (tr->dir) return tr->dir; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0eddfeb05fee..dd8205a35760 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -542,7 +542,6 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); -struct dentry *tracing_init_dentry_tr(struct trace_array *tr); struct dentry *tracing_init_dentry(void); struct ring_buffer_event; -- cgit v1.2.3 From 7eeafbcab47fe9860e5106286db83d60e3f35644 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 26 Jan 2015 21:00:48 -0500 Subject: tracing: Separate out initializing top level dir from instances The top level trace array is treated a little different than the instances, as it has to deal with more of the general tracing. The tr->dir is the tracing directory, which is an immutable dentry, where as the tr->dir of instances are the dentry that was created, and can be destroyed later. These should have different functions accessing them. As only tracing_init_dentry() deals with the top level array, fold the code for it into that function, and remove the trace_init_dentry_tr() that was also used by the instances to get their directory dentry. Add a tracing_get_dentry() to just get the tracing dir entry for instances as well as the top level array. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2668a0d742ee..5afce60e1b68 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5815,28 +5815,11 @@ static __init int register_snapshot_cmd(void) static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -static struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (tr->dir) - return tr->dir; - - if (!debugfs_initialized()) - return ERR_PTR(-ENODEV); - - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - tr->dir = debugfs_create_dir("tracing", NULL); - - if (!tr->dir) - pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return tr->dir; } -struct dentry *tracing_init_dentry(void) -{ - return tracing_init_dentry_tr(&global_trace); -} - static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { struct dentry *d_tracer; @@ -5844,7 +5827,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) if (tr->percpu_dir) return tr->percpu_dir; - d_tracer = tracing_init_dentry_tr(tr); + d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; @@ -6047,7 +6030,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) if (tr->options) return tr->options; - d_tracer = tracing_init_dentry_tr(tr); + d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; @@ -6532,6 +6515,33 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) } +/** + * tracing_init_dentry - initialize top level trace array + * + * This is called when creating files or directories in the tracing + * directory. It is called via fs_initcall() by any of the boot up code + * and expects to return the dentry of the top level tracing directory. + */ +struct dentry *tracing_init_dentry(void) +{ + struct trace_array *tr = &global_trace; + + if (tr->dir) + return tr->dir; + + if (WARN_ON(!debugfs_initialized())) + return ERR_PTR(-ENODEV); + + tr->dir = debugfs_create_dir("tracing", NULL); + + if (!tr->dir) { + pr_warn_once("Could not create debugfs directory 'tracing'\n"); + return ERR_PTR(-ENOMEM); + } + + return tr->dir; +} + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -6772,7 +6782,6 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; - if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; -- cgit v1.2.3 From a64fc82c4f1598db024de7b3d79e7213379769ff Mon Sep 17 00:00:00 2001 From: Wonhong Kwon Date: Tue, 3 Feb 2015 17:22:00 +0900 Subject: PM / hibernate: exclude freed pages from allocated pages printout hibernate_preallocate_memory() prints out that how many pages are allocated, but it doesn't take into consideration the pages freed by free_unnecessary_pages(). Therefore, it always shows the count more than actually allocated. Signed-off-by: Wonhong Kwon [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 8e75a5a1e9b4..c24d5a23bf93 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, /** * free_unnecessary_pages - Release preallocated pages not needed for the image */ -static void free_unnecessary_pages(void) +static unsigned long free_unnecessary_pages(void) { - unsigned long save, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem, free; save = count_data_pages(); if (alloc_normal >= save) { @@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void) else to_free_normal = 0; } + free = to_free_normal + to_free_highmem; memory_bm_position_reset(©_bm); @@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void) swsusp_unset_page_free(page); __free_page(page); } + + return free; } /** @@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void) * pages in memory, but we have allocated more. Release the excessive * ones now. */ - free_unnecessary_pages(); + pages -= free_unnecessary_pages(); out: stop = ktime_get(); -- cgit v1.2.3 From 40767b0dc768060266d261b4a330164b4be53f7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Jan 2015 15:08:03 +0100 Subject: sched/deadline: Fix deadline parameter modification handling Commit 67dfa1b756f2 ("sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl()") removed the hrtimer_try_cancel() function call out from init_dl_task_timer(), which gets called from __setparam_dl(). The result is that we can now re-init the timer while its active -- this is bad and corrupts timer state. Furthermore; changing the parameters of an active deadline task is tricky in that you want to maintain guarantees, while immediately effective change would allow one to circumvent the CBS guarantees -- this too is bad, as one (bad) task should not be able to affect the others. Rework things to avoid both problems. We only need to initialize the timer once, so move that to __sched_fork() for new tasks. Then make sure __setparam_dl() doesn't affect the current running state but only updates the parameters used to calculate the next scheduling period -- this guarantees the CBS functions as expected (albeit slightly pessimistic). This however means we need to make sure __dl_clear_params() needs to reset the active state otherwise new (and tasks flipping between classes) will not properly (re)compute their first instance. Todo: close class flipping CBS hole. Todo: implement delayed BW release. Reported-by: Luca Abeni Acked-by: Juri Lelli Tested-by: Luca Abeni Fixes: 67dfa1b756f2 ("sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl()") Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150128140803.GF23038@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 33 ++++++++++++++++++++++++++++----- kernel/sched/deadline.c | 3 ++- 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c86687d22b3..9e838095beb8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1814,6 +1814,10 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } /* @@ -1839,7 +1843,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif RB_CLEAR_NODE(&p->dl.rb_node); - hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + init_dl_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2049,6 +2053,9 @@ static inline int dl_bw_cpus(int i) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. + * + * XXX we should delay bw change until the task's 0-lag point, see + * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -3251,15 +3258,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; - init_dl_task_timer(dl_se); dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_throttled = 0; - dl_se->dl_new = 1; - dl_se->dl_yielded = 0; + + /* + * Changing the parameters of a task is 'tricky' and we're not doing + * the correct thing -- also see task_dead_dl() and switched_from_dl(). + * + * What we SHOULD do is delay the bandwidth release until the 0-lag + * point. This would include retaining the task_struct until that time + * and change dl_overflow() to not immediately decrement the current + * amount. + * + * Instead we retain the current runtime/deadline and let the new + * parameters take effect after the current reservation period lapses. + * This is safe (albeit pessimistic) because the 0-lag point is always + * before the current scheduling deadline. + * + * We can still have temporary overloads because we do not delay the + * change in bandwidth until that time; so admission control is + * not on the safe side. It does however guarantee tasks will never + * consume more than promised. + */ } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b52092f2636d..726470d47f87 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1094,6 +1094,7 @@ static void task_dead_dl(struct task_struct *p) * Since we are TASK_DEAD we won't slip out of the domain! */ raw_spin_lock_irq(&dl_b->lock); + /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); @@ -1614,8 +1615,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p) static void switched_from_dl(struct rq *rq, struct task_struct *p) { + /* XXX we should retain the bw until 0-lag */ cancel_dl_timer(rq, p); - __dl_clear_params(p); /* -- cgit v1.2.3 From a7bebf488791aa1036f3e6629daf01d01f705dcb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 26 Nov 2014 08:44:01 +0800 Subject: sched/deadline: Fix hrtick for a non-leftmost task After update_curr_dl() the current task might not be the leftmost task anymore. In that case do not start a new hrtick for it. In this case NEED_RESCHED will be set and the next schedule will start the hrtick for the new task if and when appropriate. Signed-off-by: Wanpeng Li Acked-by: Juri Lelli [ Rewrote the changelog and comment. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1416962647-76792-2-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e0e9c2986976..7b684f9341a5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1073,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + /* + * Even when we have runtime, update_curr_dl() might have resulted in us + * not being the leftmost task anymore. In that case NEED_RESCHED will + * be set and schedule() will start a new hrtick for the next task. + */ + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } -- cgit v1.2.3 From 1019a359d3dc4b64d0e1e5a5efcb725d5e83994d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Nov 2014 08:44:03 +0800 Subject: sched/deadline: Fix stale yield state When we fail to start the deadline timer in update_curr_dl(), we forget to clear ->dl_yielded, resulting in wrecked time keeping. Since the natural place to clear both ->dl_yielded and ->dl_throttled is in replenish_dl_entity(); both are after all waiting for that event; make it so. Luckily since 67dfa1b756f2 ("sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl()") the task_on_rq_queued() condition in dl_task_timer() must be true, and can therefore call enqueue_task_dl() unconditionally. Reported-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1416962647-76792-4-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7b684f9341a5..a027799ae130 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } + + if (dl_se->dl_yielded) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; } /* @@ -536,23 +541,19 @@ again: sched_clock_tick(); update_rq_clock(rq); - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - if (task_on_rq_queued(p)) { - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); #ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) - push_dl_task(rq); + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); #endif - } unlock: raw_spin_unlock(&rq->lock); @@ -613,10 +614,9 @@ static void update_curr_dl(struct rq *rq) dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { + dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) - dl_se->dl_throttled = 1; - else + if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -853,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. */ - if (p->dl.dl_throttled) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) return; enqueue_dl_entity(&p->dl, pi_se, flags); -- cgit v1.2.3 From 75381608e8410a72ae8b4080849dc86b472c01fb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 26 Nov 2014 08:44:04 +0800 Subject: sched/deadline: Avoid pointless __setscheduler() There is no need to dequeue/enqueue and push/pull if there are no scheduling parameters changed for the DL class. Both fair and RT classes already check if parameters changed for them to avoid unnecessary overhead. This patch add the parameters changed test for the DL class in order to reduce overhead. Signed-off-by: Wanpeng Li [ Fixed up the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1416962647-76792-5-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 50a5352f6205..d59652d60f86 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3417,6 +3417,20 @@ static bool check_same_owner(struct task_struct *p) return match; } +static bool dl_param_changed(struct task_struct *p, + const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) @@ -3545,7 +3559,7 @@ recheck: goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; - if (dl_policy(policy)) + if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; p->sched_reset_on_fork = reset_on_fork; -- cgit v1.2.3 From 868933359a3bdda25b562e9d41bce7071edc1b08 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 26 Nov 2014 08:44:06 +0800 Subject: sched: Fix hrtick_start() on UP The commit 177ef2a6315e ("sched/deadline: Fix a precision problem in the microseconds range") forgot to change the UP version of hrtick_start(), do so now. Signed-off-by: Wanpeng Li Fixes: 177ef2a6315e ("sched/deadline: Fix a precision problem in the microseconds range") [ Fixed the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1416962647-76792-7-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d59652d60f86..5091fd4feed8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -492,6 +492,11 @@ static __init void init_hrtick(void) */ void hrtick_start(struct rq *rq, u64 delay) { + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + delay = max_t(u64, delay, 10000LL); __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); } -- cgit v1.2.3 From 9659e1eeee28f7025b6545934d644d19e9c6e603 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 19 Jan 2015 04:49:37 +0000 Subject: sched/deadline: Remove cpu_active_mask from cpudl_find() cpu_active_mask is rarely changed (only on hotplug), so remove this operation to gain a little performance. If there is a change in cpu_active_mask, rq_online_dl() and rq_offline_dl() should take care of it normally, so cpudl::free_cpus carries enough information for us. For the rare case when a task is put onto a dying cpu (which rq_offline_dl() can't handle in a timely fashion), it will be handled through _cpu_down()->...->multi_cpu_stop()->migration_call() ->migrate_tasks(), preventing the task from hanging on the dead cpu. Cc: Juri Lelli Signed-off-by: Xunlei Pang [peterz: changelog] Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1421642980-10045-2-git-send-email-pang.xunlei@linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index fd9d3fb49d0d..c6acb07466bb 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -108,8 +108,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed) && - cpumask_and(later_mask, later_mask, cpu_active_mask)) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && -- cgit v1.2.3 From bfd9b2b5f80e7289fdd50210afe4d9ca5952a865 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 28 Jan 2015 01:24:09 +0100 Subject: sched: Pull resched loop to __schedule() callers __schedule() disables preemption during its job and re-enables it afterward without doing a preemption check to avoid recursion. But if an event happens after the context switch which requires rescheduling, we need to check again if a task of a higher priority needs the CPU. A preempt irq can raise such a situation. To handle that, __schedule() loops on need_resched(). But preempt_schedule_*() functions, which call __schedule(), also loop on need_resched() to handle missed preempt irqs. Hence we end up with the same loop happening twice. Lets simplify that by attributing the need_resched() loop responsibility to all __schedule() callers. There is a risk that the outer loop now handles reschedules that used to be handled by the inner loop with the added overhead of caller details (inc/dec of PREEMPT_ACTIVE, irq save/restore) but assuming those inner rescheduling loop weren't too frequent, this shouldn't matter. Especially since the whole preemption path is now losing one loop in any case. Suggested-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1422404652-29067-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5091fd4feed8..b269e8a2a516 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2765,6 +2765,10 @@ again: * - explicit schedule() call * - return from syscall or exception to user-space * - return from interrupt-handler to user-space + * + * WARNING: all callers must re-check need_resched() afterward and reschedule + * accordingly in case an event triggered the need for rescheduling (such as + * an interrupt waking up a task) while preemption was disabled in __schedule(). */ static void __sched __schedule(void) { @@ -2773,7 +2777,6 @@ static void __sched __schedule(void) struct rq *rq; int cpu; -need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -2840,8 +2843,6 @@ need_resched: post_schedule(rq); sched_preempt_enable_no_resched(); - if (need_resched()) - goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) @@ -2861,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void) struct task_struct *tsk = current; sched_submit_work(tsk); - __schedule(); + do { + __schedule(); + } while (need_resched()); } EXPORT_SYMBOL(schedule); -- cgit v1.2.3 From 139b6fd26d85a65c4e0d2795b87b94f9505e5943 Mon Sep 17 00:00:00 2001 From: Sharon Dvir Date: Sun, 1 Feb 2015 23:47:32 +0200 Subject: sched/Documentation: Remove unneeded word The second 'mutex' shouldn't be there, it can't be about the mutex, as the mutex can't be freed, but unlocked, the memory where the mutex resides however, can be freed. Signed-off-by: Sharon Dvir Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422827252-31363-1-git-send-email-sharon.dvir1@mail.huji.ac.il Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 454195194d4a..5b49f652a3cf 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); * The mutex must later on be released by the same task that * acquired it. Recursive locking is not allowed. The task * may not exit without first unlocking the mutex. Also, kernel - * memory where the mutex resides mutex must not be freed with + * memory where the mutex resides must not be freed with * the mutex still locked. The mutex must first be initialized * (or statically defined) before it can be locked. memset()-ing * the mutex to 0 is not allowed. -- cgit v1.2.3 From 51587bcf31d070119d37de6103543c807f5ccdb3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 19 Jan 2015 17:39:21 -0800 Subject: locking/mutex: Explicitly mark task as running after wakeup By the time we wake up and get the lock after being asleep in the slowpath, we better be running. As good practice, be explicit about this and avoid any mischief. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1421717961.4903.11.camel@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c67a60b61625..57407062e209 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -587,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } + __set_task_state(task, TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current_thread_info()); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) -- cgit v1.2.3 From de30ec47302c101c7badc8fe687641fd75e596e7 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 17 Jan 2015 05:05:34 +0100 Subject: sched/completion: Remove unnecessary ->wait.lock serialization when reading completion state Signed-off-by: Nicholas Mc Guire Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1421467534-22834-1-git-send-email-der.herr@hofr.at Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 607f852b4d04..9d1fe32da232 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -288,13 +288,6 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; + return !!ACCESS_ONCE(x->done); } EXPORT_SYMBOL(completion_done); -- cgit v1.2.3 From 7c34e3180a01c800a40bc8535654d5735802fc1b Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 23 Jan 2015 12:41:47 +0100 Subject: sched/completion: Add lock-free checking of the blocking case The "thread would block" case can be checked without grabbing ->wait.lock. [ If the check does not return early then grab the lock and recheck. A memory barrier is not needed as complete() and complete_all() imply a barrier. The ACCESS_ONCE() is needed for calls in a loop that, if inlined, could optimize out the re-fetching of x->done. ] Signed-off-by: Nicholas Mc Guire Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422013307-13200-1-git-send-email-der.herr@hofr.at Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 9d1fe32da232..7052d3fd4e7b 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; + /* + * Since x->done will need to be locked only + * in the non-blocking case, we check x->done + * first without taking the lock so we can + * return early in the blocking case. + */ + if (!ACCESS_ONCE(x->done)) + return 0; + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; -- cgit v1.2.3 From 73105994c57d06e40a33ab5a716db04e898b4c05 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 25 Jan 2015 23:36:04 -0800 Subject: locking/rwsem: Use task->state helpers Call __set_task_state() instead of assigning the new state directly. These interfaces also aid CONFIG_DEBUG_ATOMIC_SLEEP environments, keeping track of who last changed the state. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Jason Low Cc: Michel Lespinasse Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422257769-14083-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 2 +- kernel/locking/rwsem-xadd.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2c93571162cb..2555ae15ec14 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); } - tsk->state = TASK_RUNNING; + __set_task_state(tsk, TASK_RUNNING); out: ; } diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7628c3fc37ca..2f7cc4076f50 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) schedule(); } - tsk->state = TASK_RUNNING; - + __set_task_state(tsk, TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); -- cgit v1.2.3 From afffc6c1805d98e08e778cddb644a666e78cfcfd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 1 Feb 2015 22:16:24 -0800 Subject: locking/rtmutex: Optimize setting task running after being blocked We explicitly mark the task running after returning from a __rt_mutex_slowlock() call, which does the actual sleeping via wait-wake-trylocking. As such, this patch does two things: (1) refactors the code so that setting current to TASK_RUNNING is done by __rt_mutex_slowlock(), and not by the callers. The downside to this is that it becomes a bit unclear when at what point we block. As such I've added a comment that the task blocks when calling __rt_mutex_slowlock() so readers can figure out when it is running again. (2) relaxes setting current's state through __set_current_state(), instead of it's more expensive barrier alternative. There was no need for the implied barrier as we're obviously not planning on blocking. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422857784.18096.1.camel@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7c98873a3077..3059bc2f022d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); } + __set_current_state(TASK_RUNNING); return ret; } @@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - set_current_state(TASK_RUNNING); - if (unlikely(ret)) { remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); @@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - set_current_state(TASK_RUNNING); - if (unlikely(ret)) remove_waiter(lock, waiter); -- cgit v1.2.3 From 652884fe0c7bd57f534c5fe68d6def0dc8c4b7ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Jan 2015 11:20:10 +0100 Subject: perf: Add a bit of paranoia Add a few WARN()s to catch things that should never happen. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150123125834.150481799@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b4a696c4dc76..b358cb38e4a5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1275,6 +1275,8 @@ static void perf_group_attach(struct perf_event *event) if (group_leader == event) return; + WARN_ON_ONCE(group_leader->ctx != event->ctx); + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && !is_software_event(event)) group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; @@ -1296,6 +1298,10 @@ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * We can have double detach due to exit/hot-unplug + close. */ @@ -1380,6 +1386,8 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_flags = event->group_flags; + + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: @@ -1442,6 +1450,10 @@ event_sched_out(struct perf_event *event, { u64 tstamp = perf_event_time(event); u64 delta; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * An event which could not be activated because of * filter mismatch still needs to have its timings @@ -7822,14 +7834,19 @@ static void perf_free_event(struct perf_event *event, put_event(parent); + raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* - * free an unexposed, unused context as created by inheritance by + * Free an unexposed, unused context as created by inheritance by * perf_event_init_task below, used by fork() in case of fail. + * + * Not all locks are strictly required, but take them anyway to be nice and + * help out with the lockdep assertions. */ void perf_event_free_task(struct task_struct *task) { -- cgit v1.2.3 From f63a8daa5812afef4f06c962351687e1ff9ccb2b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Jan 2015 12:24:14 +0100 Subject: perf: Fix event->ctx locking There have been a few reported issues wrt. the lack of locking around changing event->ctx. This patch tries to address those. It avoids the whole rwsem thing; and while it appears to work, please give it some thought in review. What I did fail at is sensible runtime checks on the use of event->ctx, the RCU use makes it very hard. Signed-off-by: Peter Zijlstra (Intel) Cc: Paul E. McKenney Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150123125834.209535886@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 244 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 207 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b358cb38e4a5..417a96bf3d41 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -906,6 +906,77 @@ static void put_ctx(struct perf_event_context *ctx) } } +/* + * Because of perf_event::ctx migration in sys_perf_event_open::move_group and + * perf_pmu_migrate_context() we need some magic. + * + * Those places that change perf_event::ctx will hold both + * perf_event_ctx::mutex of the 'old' and 'new' ctx value. + * + * Lock ordering is by mutex address. There is one other site where + * perf_event_context::mutex nests and that is put_event(). But remember that + * that is a parent<->child context relation, and migration does not affect + * children, therefore these two orderings should not interact. + * + * The change in perf_event::ctx does not affect children (as claimed above) + * because the sys_perf_event_open() case will install a new event and break + * the ctx parent<->child relation, and perf_pmu_migrate_context() is only + * concerned with cpuctx and that doesn't have children. + * + * The places that change perf_event::ctx will issue: + * + * perf_remove_from_context(); + * synchronize_rcu(); + * perf_install_in_context(); + * + * to affect the change. The remove_from_context() + synchronize_rcu() should + * quiesce the event, after which we can install it in the new location. This + * means that only external vectors (perf_fops, prctl) can perturb the event + * while in transit. Therefore all such accessors should also acquire + * perf_event_context::mutex to serialize against this. + * + * However; because event->ctx can change while we're waiting to acquire + * ctx->mutex we must be careful and use the below perf_event_ctx_lock() + * function. + * + * Lock order: + * task_struct::perf_event_mutex + * perf_event_context::mutex + * perf_event_context::lock + * perf_event::child_mutex; + * perf_event::mmap_mutex + * mmap_sem + */ +static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event) +{ + struct perf_event_context *ctx; + +again: + rcu_read_lock(); + ctx = ACCESS_ONCE(event->ctx); + if (!atomic_inc_not_zero(&ctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock(&ctx->mutex); + if (event->ctx != ctx) { + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + + return ctx; +} + +static void perf_event_ctx_unlock(struct perf_event *event, + struct perf_event_context *ctx) +{ + mutex_unlock(&ctx->mutex); + put_ctx(ctx); +} + /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up @@ -1666,7 +1737,7 @@ int __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -void perf_event_disable(struct perf_event *event) +static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1707,6 +1778,19 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } + +/* + * Strictly speaking kernel users cannot create groups and therefore this + * interface does not need the perf_event_ctx_lock() magic. + */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_disable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, @@ -2170,7 +2254,7 @@ unlock: * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -void perf_event_enable(struct perf_event *event) +static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -2226,9 +2310,21 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } + +/* + * See perf_event_disable(); + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_enable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_enable); -int perf_event_refresh(struct perf_event *event, int refresh) +static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -2237,10 +2333,25 @@ int perf_event_refresh(struct perf_event *event, int refresh) return -EINVAL; atomic_add(refresh, &event->event_limit); - perf_event_enable(event); + _perf_event_enable(event); return 0; } + +/* + * See perf_event_disable() + */ +int perf_event_refresh(struct perf_event *event, int refresh) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_refresh(event, refresh); + perf_event_ctx_unlock(event, ctx); + + return ret; +} EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, @@ -3433,7 +3544,16 @@ static void perf_remove_from_owner(struct perf_event *event) rcu_read_unlock(); if (owner) { - mutex_lock(&owner->perf_event_mutex); + /* + * If we're here through perf_event_exit_task() we're already + * holding ctx->mutex which would be an inversion wrt. the + * normal lock order. + * + * However we can safely take this lock because its the child + * ctx->mutex. + */ + mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); + /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex @@ -3559,12 +3679,13 @@ static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; struct perf_event_context *ctx = leader->ctx; - u64 values[5]; + int n = 0, size = 0, ret; u64 count, enabled, running; + u64 values[5]; + + lockdep_assert_held(&ctx->mutex); - mutex_lock(&ctx->mutex); count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; @@ -3579,7 +3700,7 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - goto unlock; + return -EFAULT; ret = size; @@ -3593,14 +3714,11 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; + return -EFAULT; } ret += size; } -unlock: - mutex_unlock(&ctx->mutex); return ret; } @@ -3672,8 +3790,14 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + int ret; - return perf_read_hw(event, buf, count); + ctx = perf_event_ctx_lock(event); + ret = perf_read_hw(event, buf, count); + perf_event_ctx_unlock(event, ctx); + + return ret; } static unsigned int perf_poll(struct file *file, poll_table *wait) @@ -3699,7 +3823,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } -static void perf_event_reset(struct perf_event *event) +static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); local64_set(&event->count, 0); @@ -3718,6 +3842,7 @@ static void perf_event_for_each_child(struct perf_event *event, struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) @@ -3731,14 +3856,13 @@ static void perf_event_for_each(struct perf_event *event, struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); + lockdep_assert_held(&ctx->mutex); + event = event->group_leader; perf_event_for_each_child(event, func); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); - mutex_unlock(&ctx->mutex); } static int perf_event_period(struct perf_event *event, u64 __user *arg) @@ -3808,25 +3932,24 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { - struct perf_event *event = file->private_data; void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; + func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; + func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: - func = perf_event_reset; + func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); + return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); @@ -3873,6 +3996,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + long ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_ioctl(event, cmd, arg); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3895,11 +4031,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, int perf_event_task_enable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_enable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(¤t->perf_event_mutex); return 0; @@ -3907,11 +4047,15 @@ int perf_event_task_enable(void) int perf_event_task_disable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_disable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(¤t->perf_event_mutex); return 0; @@ -7269,6 +7413,15 @@ out: return ret; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7284,7 +7437,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *uninitialized_var(gctx); struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -7482,9 +7635,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; + gctx = group_leader->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&gctx->mutex, &ctx->mutex); - mutex_lock(&gctx->mutex); perf_remove_from_context(group_leader, false); /* @@ -7499,15 +7657,19 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(sibling); put_ctx(gctx); } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); + } else { + mutex_lock(&ctx->mutex); } WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); if (move_group) { + /* + * Wait for everybody to stop referencing the events through + * the old lists, before installing it on new lists. + */ synchronize_rcu(); + perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -7519,6 +7681,11 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); + + if (move_group) { + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -7626,7 +7793,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; - mutex_lock(&src_ctx->mutex); + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { perf_remove_from_context(event, false); @@ -7634,11 +7805,9 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) put_ctx(src_ctx); list_add(&event->migrate_entry, &events); } - mutex_unlock(&src_ctx->mutex); synchronize_rcu(); - mutex_lock(&dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &events, migrate_entry) { list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) @@ -7648,6 +7817,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) get_ctx(dst_ctx); } mutex_unlock(&dst_ctx->mutex); + mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); -- cgit v1.2.3 From 8f95b435b62522aed3381aaea920de8d09ccabf3 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 27 Jan 2015 11:53:12 +0100 Subject: perf: Fix move_group() order Jiri reported triggering the new WARN_ON_ONCE in event_sched_out over the weekend: event_sched_out.isra.79+0x2b9/0x2d0 group_sched_out+0x69/0xc0 ctx_sched_out+0x106/0x130 task_ctx_sched_out+0x37/0x70 __perf_install_in_context+0x70/0x1a0 remote_function+0x48/0x60 generic_exec_single+0x15b/0x1d0 smp_call_function_single+0x67/0xa0 task_function_call+0x53/0x80 perf_install_in_context+0x8b/0x110 I think the below should cure this; if we install a group leader it will iterate the (still intact) group list and find its siblings and try and install those too -- even though those still have the old event->ctx -- in the new ctx. Upon installing the first group sibling we'd try and schedule out the group and trigger the above warn. Fix this by installing the group leader last, installing siblings would have no effect, they're not reachable through the group lists and therefore we don't schedule them. Also delay resetting the state until we're absolutely sure the events are quiescent. Reported-by: Jiri Olsa Reported-by: vincent.weaver@maine.edu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150126162639.GA21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 56 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 417a96bf3d41..142dbabc1615 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7645,16 +7645,9 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(group_leader, false); - /* - * Removing from the context ends up with disabled - * event. What we want here is event in the initial - * startup state, ready to be add into new context. - */ - perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { perf_remove_from_context(sibling, false); - perf_event__state_init(sibling); put_ctx(gctx); } } else { @@ -7670,13 +7663,31 @@ SYSCALL_DEFINE5(perf_event_open, */ synchronize_rcu(); - perf_install_in_context(ctx, group_leader, group_leader->cpu); - get_ctx(ctx); + /* + * Install the group siblings before the group leader. + * + * Because a group leader will try and install the entire group + * (through the sibling list, which is still in-tact), we can + * end up with siblings installed in the wrong context. + * + * By installing siblings first we NO-OP because they're not + * reachable through the group lists. + */ list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { + perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } + + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. + */ + perf_event__state_init(group_leader); + perf_install_in_context(ctx, group_leader, group_leader->cpu); + get_ctx(ctx); } perf_install_in_context(ctx, event, event->cpu); @@ -7806,8 +7817,35 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) list_add(&event->migrate_entry, &events); } + /* + * Wait for the events to quiesce before re-instating them. + */ synchronize_rcu(); + /* + * Re-instate events in 2 passes. + * + * Skip over group leaders and only install siblings on this first + * pass, siblings will not get enabled without a leader, however a + * leader will enable its siblings, even if those are still on the old + * context. + */ + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + if (event->group_leader == event) + continue; + + list_del(&event->migrate_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, dst_cpu); + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + + /* + * Once all the siblings are setup properly, install the group leaders + * to make it go. + */ list_for_each_entry_safe(event, tmp, &events, migrate_entry) { list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) -- cgit v1.2.3 From a83fe28e2e45392464858a96745db26ac73670c8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Jan 2015 14:44:34 +0100 Subject: perf: Fix put_event() ctx lock So what I suspect; but I'm in zombie mode today it seems; is that while I initially thought that it was impossible for ctx to change when refcount dropped to 0, I now suspect its possible. Note that until perf_remove_from_context() the event is still active and visible on the lists. So a concurrent sys_perf_event_open() from another task into this task can race. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Stephane Eranian Cc: mark.rutland@arm.com Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150129134434.GB26304@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 142dbabc1615..f773fa13d7c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -947,7 +947,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event::mmap_mutex * mmap_sem */ -static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event) +static struct perf_event_context * +perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; @@ -960,7 +961,7 @@ again: } rcu_read_unlock(); - mutex_lock(&ctx->mutex); + mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); @@ -970,6 +971,12 @@ again: return ctx; } +static inline struct perf_event_context * +perf_event_ctx_lock(struct perf_event *event) +{ + return perf_event_ctx_lock_nested(event, 0); +} + static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { @@ -3572,7 +3579,7 @@ static void perf_remove_from_owner(struct perf_event *event) */ static void put_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; + struct perf_event_context *ctx; if (!atomic_long_dec_and_test(&event->refcount)) return; @@ -3580,7 +3587,6 @@ static void put_event(struct perf_event *event) if (!is_kernel_event(event)) perf_remove_from_owner(event); - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: * @@ -3593,7 +3599,8 @@ static void put_event(struct perf_event *event) * the last filedesc died, so there is no possibility * to trigger the AB-BA case. */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); + WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, true); mutex_unlock(&ctx->mutex); -- cgit v1.2.3 From 7c60fc0e022858e19c66289ec65cc3effc8c0b1f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 28 Jan 2015 18:54:38 +0100 Subject: perf: Use POLLIN instead of POLL_IN for perf poll data in flag Currently we flag available data (via poll syscall) on perf fd with POLL_IN macro, which is normally used for SIGIO interface. We've been lucky, because POLLIN (0x1) is subset of POLL_IN (0x20001) and sys_poll (do_pollfd function) cut the extra bit out (0x20000). Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Stephane Eranian Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422467678-22341-1-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 146a5792b1d2..eadb95ce7aac 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -13,12 +13,13 @@ #include #include #include +#include #include "internal.h" static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, POLL_IN); + atomic_set(&handle->rb->poll, POLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending); -- cgit v1.2.3 From cc34b98bacb0e102fb720d95a25fed5c6090a70d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Jan 2015 14:56:51 +0000 Subject: perf: Drop module reference on event init failure When initialising an event, perf_init_event will call try_module_get() to ensure that the PMU's module cannot be removed for the lifetime of the event, with __free_event() dropping the reference when the event is finally destroyed. If something fails after the event has been initialised, but before the event is installed, perf_event_alloc will drop the reference on the module. However, if we fail to initialise an event for some reason (e.g. we ask an uncore PMU to perform sampling, and it refuses to initialise the event), we do not drop the refcount. If we try to open such a bogus event without a precise IDR type, we will loop over each PMU in the pmus list, incrementing each of their refcounts without decrementing them. This patch adds a module_put when pmu->event_init(event) fails, ensuring that the refcounts are balanced in failure cases. As the innards of the precise and search based initialisation look very similar, this logic is hoisted out into a new helper function. While the early return for the failed try_module_get is removed from the search case, this is handled by the remaining return when ret is not -ENOENT. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420642611-22667-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f773fa13d7c2..37cc20e8aa3b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7027,6 +7027,20 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) +{ + int ret; + + if (!try_module_get(pmu->module)) + return -ENODEV; + event->pmu = pmu; + ret = pmu->event_init(event); + if (ret) + module_put(pmu->module); + + return ret; +} + struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; @@ -7039,24 +7053,14 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; -- cgit v1.2.3 From 2fde4f94e0a9531251e706fa57131b51b0df042e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Jan 2015 15:01:54 +0000 Subject: perf: Decouple unthrottling and rotating Currently the adjusments made as part of perf_event_task_tick() use the percpu rotation lists to iterate over any active PMU contexts, but these are not used by the context rotation code, having been replaced by separate (per-context) hrtimer callbacks. However, some manipulation of the rotation lists (i.e. removal of contexts) has remained in perf_rotate_context(). This leads to the following issues: * Contexts are not always removed from the rotation lists. Removal of PMUs which have been placed in rotation lists, but have not been removed by a hrtimer callback can result in corruption of the rotation lists (when memory backing the context is freed). This has been observed to result in hangs when PMU drivers built as modules are inserted and removed around the creation of events for said PMUs. * Contexts which do not require rotation may be removed from the rotation lists as a result of a hrtimer, and will not be considered by the unthrottling code in perf_event_task_tick. This patch fixes the issue by updating the rotation ist when events are scheduled in/out, ensuring that each rotation list stays in sync with the HW state. As each event holds a refcount on the module of its PMU, this ensures that when a PMU module is unloaded none of its CPU contexts can be in a rotation list. By maintaining a list of perf_event_contexts rather than perf_event_cpu_contexts, we don't need separate paths to handle the cpu and task contexts, which also makes the code a little simpler. As the rotation_list variables are not used for rotation, these are renamed to active_ctx_list, which better matches their current function. perf_pmu_rotate_{start,stop} are renamed to perf_pmu_ctx_{activate,deactivate}. Reported-by: Johannes Jensen Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Fengguang Wu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150129134511.GR17721@leverpostej Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 81 +++++++++++++++++----------------------------- 2 files changed, 30 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 216653466a67..5cad0e6f3552 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -469,6 +469,7 @@ struct perf_event_context { */ struct mutex mutex; + struct list_head active_ctx_list; struct list_head pinned_groups; struct list_head flexible_groups; struct list_head event_list; @@ -519,7 +520,6 @@ struct perf_cpu_context { int exclusive; struct hrtimer hrtimer; ktime_t hrtimer_interval; - struct list_head rotation_list; struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 37cc20e8aa3b..7f2fbb8b5069 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, rotation_list); +static DEFINE_PER_CPU(struct list_head, active_ctx_list); /* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. + * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and + * perf_event_task_tick() are fully serialized because they're strictly cpu + * affine and perf_event_ctx{activate,deactivate} are called with IRQs + * disabled, while perf_event_task_tick is called from IRQ context. */ -static void perf_pmu_rotate_start(struct pmu *pmu) +static void perf_event_ctx_activate(struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = this_cpu_ptr(&rotation_list); + struct list_head *head = this_cpu_ptr(&active_ctx_list); WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); + WARN_ON(!list_empty(&ctx->active_ctx_list)); + + list_add(&ctx->active_ctx_list, head); +} + +static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +{ + WARN_ON(!irqs_disabled()); + + WARN_ON(list_empty(&ctx->active_ctx_list)); + + list_del_init(&ctx->active_ctx_list); } static void get_ctx(struct perf_event_context *ctx) @@ -1233,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_branch_stack++; list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1561,7 +1569,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; - ctx->nr_active--; + if (!--ctx->nr_active) + perf_event_ctx_deactivate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) @@ -1885,7 +1894,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; - ctx->nr_active++; + if (!ctx->nr_active++) + perf_event_ctx_activate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; @@ -2742,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); } /* @@ -3035,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx) list_rotate_left(&ctx->flexible_groups); } -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. - */ static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0; if (cpuctx->ctx.nr_events) { - remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { - remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; } @@ -3077,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: - if (remove) - list_del_init(&cpuctx->rotation_list); return rotate; } @@ -3096,9 +3091,8 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - struct perf_event_context *ctx; + struct list_head *head = this_cpu_ptr(&active_ctx_list); + struct perf_event_context *ctx, *tmp; int throttled; WARN_ON(!irqs_disabled()); @@ -3106,14 +3100,8 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - ctx = &cpuctx->ctx; + list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); - - ctx = cpuctx->task_ctx; - if (ctx) - perf_adjust_freq_unthr_context(ctx, throttled); - } } static int event_enable_on_exec(struct perf_event *event, @@ -3272,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->active_ctx_list); INIT_LIST_HEAD(&ctx->pinned_groups); INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); @@ -6954,7 +6943,6 @@ skip_type: __perf_cpu_hrtimer_init(cpuctx, cpu); - INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -8384,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); } } @@ -8405,22 +8393,11 @@ static void perf_event_init_cpu(int cpu) } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; - perf_pmu_rotate_stop(ctx->pmu); - rcu_read_lock(); list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) __perf_remove_from_context(&re); -- cgit v1.2.3 From 12cf89b550d13eb7cb86ef182bd6c04345a33a1f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Feb 2015 16:45:18 -0600 Subject: livepatch: rename config to CONFIG_LIVEPATCH Rename CONFIG_LIVE_PATCHING to CONFIG_LIVEPATCH to make the naming of the config and the code more consistent. Signed-off-by: Josh Poimboeuf Reviewed-by: Jingoo Han Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/Makefile | 2 +- include/linux/livepatch.h | 4 ++-- kernel/livepatch/Kconfig | 6 +++--- kernel/livepatch/Makefile | 2 +- samples/Kconfig | 4 ++-- samples/livepatch/Makefile | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29b095231276..11970b076862 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,7 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVE_PATCHING + select HAVE_LIVEPATCH ### Arch settings config X86 diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 26e58134c8cb..a455a53d789a 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -24,7 +24,7 @@ #include #include -#ifdef CONFIG_LIVE_PATCHING +#ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) { #ifndef CC_USING_FENTRY @@ -40,7 +40,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } #else -#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#error Live patching support is disabled; check CONFIG_LIVEPATCH #endif #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 316b34e74c15..732223496968 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,7 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index f14c6fb262b4..95023fd8b00d 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -24,7 +24,7 @@ #include #include -#if IS_ENABLED(CONFIG_LIVE_PATCHING) +#if IS_ENABLED(CONFIG_LIVEPATCH) #include @@ -128,6 +128,6 @@ extern int klp_unregister_patch(struct klp_patch *); extern int klp_enable_patch(struct klp_patch *); extern int klp_disable_patch(struct klp_patch *); -#endif /* CONFIG_LIVE_PATCHING */ +#endif /* CONFIG_LIVEPATCH */ #endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 347ee2221137..045022557936 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,15 +1,15 @@ -config HAVE_LIVE_PATCHING +config HAVE_LIVEPATCH bool help Arch supports kernel live patching -config LIVE_PATCHING +config LIVEPATCH bool "Kernel Live Patching" depends on DYNAMIC_FTRACE_WITH_REGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL - depends on HAVE_LIVE_PATCHING + depends on HAVE_LIVEPATCH help Say Y here if you want to support kernel live patching. This option has no runtime impact until a kernel "patch" diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 7c1f00861428..e8780c0901d9 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o diff --git a/samples/Kconfig b/samples/Kconfig index 0aed20df5f0b..224ebb46bed5 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -63,9 +63,9 @@ config SAMPLE_RPMSG_CLIENT to communicate with an AMP-configured remote processor over the rpmsg bus. -config SAMPLE_LIVE_PATCHING +config SAMPLE_LIVEPATCH tristate "Build live patching sample -- loadable modules only" - depends on LIVE_PATCHING && m + depends on LIVEPATCH && m help Builds a sample live patch that replaces the procfs handler for /proc/cmdline to print "this has been live patched". diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile index 7f1cdc131a02..10319d7ea0b1 100644 --- a/samples/livepatch/Makefile +++ b/samples/livepatch/Makefile @@ -1 +1 @@ -obj-$(CONFIG_SAMPLE_LIVE_PATCHING) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o -- cgit v1.2.3 From 2d926c15d629a13914ce3e5f26354f6a0ac99e70 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Feb 2015 16:45:26 -0800 Subject: hrtimer: Fix incorrect tai offset calculation for non high-res timer systems I noticed some CLOCK_TAI timer test failures on one of my less-frequently used configurations. And after digging in I found in 76f4108892d9 (Cleanup hrtimer accessors to the timekepeing state), the hrtimer_get_softirq_time tai offset calucation was incorrectly rewritten, as the tai offset we return shold be from CLOCK_MONOTONIC, and not CLOCK_REALTIME. This results in CLOCK_TAI timers expiring early on non-highres capable machines. This patch fixes the issue, calculating the tai time properly from the monotonic base. Signed-off-by: John Stultz Cc: Thomas Gleixner Cc: stable # 3.17+ Link: http://lkml.kernel.org/r/1423097126-10236-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..d8c724cda37b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); boot = ktime_add(mono, off_boot); xtim = ktime_add(mono, off_real); - tai = ktime_add(xtim, off_tai); + tai = ktime_add(mono, off_tai); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; -- cgit v1.2.3 From 90e97820619dc912b52cc9d103272819d8b51259 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:43 +0800 Subject: resources: Move struct resource_list_entry from ACPI into resource core Currently ACPI, PCI and pnp all implement the same resource list management with different data structure. We need to transfer from one data structure into another when passing resources from one subsystem into another subsystem. So move struct resource_list_entry from ACPI into resource core and rename it as resource_entry, then it could be reused by different subystems and avoid the data structure conversion. Introduce dedicated header file resource_ext.h instead of embedding it into ioport.h to avoid header file inclusion order issues. Signed-off-by: Jiang Liu Acked-by: Vinod Koul Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_lpss.c | 8 ++--- drivers/acpi/acpi_platform.c | 4 +-- drivers/acpi/resource.c | 17 ++++------ drivers/dma/acpi-dma.c | 10 +++--- include/linux/acpi.h | 12 +------ include/linux/resource_ext.h | 77 ++++++++++++++++++++++++++++++++++++++++++++ kernel/resource.c | 25 ++++++++++++++ 7 files changed, 120 insertions(+), 33 deletions(-) create mode 100644 include/linux/resource_ext.h (limited to 'kernel') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 4f3febf8a589..dfd1b8095dad 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -313,7 +313,7 @@ static int acpi_lpss_create_device(struct acpi_device *adev, { struct lpss_device_desc *dev_desc; struct lpss_private_data *pdata; - struct resource_list_entry *rentry; + struct resource_entry *rentry; struct list_head resource_list; struct platform_device *pdev; int ret; @@ -333,12 +333,12 @@ static int acpi_lpss_create_device(struct acpi_device *adev, goto err_out; list_for_each_entry(rentry, &resource_list, node) - if (resource_type(&rentry->res) == IORESOURCE_MEM) { + if (resource_type(rentry->res) == IORESOURCE_MEM) { if (dev_desc->prv_size_override) pdata->mmio_size = dev_desc->prv_size_override; else - pdata->mmio_size = resource_size(&rentry->res); - pdata->mmio_base = ioremap(rentry->res.start, + pdata->mmio_size = resource_size(rentry->res); + pdata->mmio_base = ioremap(rentry->res->start, pdata->mmio_size); break; } diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 6ba8beb6b9d2..1284138e42ab 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -45,7 +45,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) struct platform_device *pdev = NULL; struct acpi_device *acpi_parent; struct platform_device_info pdevinfo; - struct resource_list_entry *rentry; + struct resource_entry *rentry; struct list_head resource_list; struct resource *resources = NULL; int count; @@ -71,7 +71,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) } count = 0; list_for_each_entry(rentry, &resource_list, node) - resources[count++] = rentry->res; + resources[count++] = *rentry->res; acpi_dev_free_resource_list(&resource_list); } diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 3ea0d17eb951..4752b9939987 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -444,12 +444,7 @@ EXPORT_SYMBOL_GPL(acpi_dev_resource_interrupt); */ void acpi_dev_free_resource_list(struct list_head *list) { - struct resource_list_entry *rentry, *re; - - list_for_each_entry_safe(rentry, re, list, node) { - list_del(&rentry->node); - kfree(rentry); - } + resource_list_free(list); } EXPORT_SYMBOL_GPL(acpi_dev_free_resource_list); @@ -464,16 +459,16 @@ struct res_proc_context { static acpi_status acpi_dev_new_resource_entry(struct resource_win *win, struct res_proc_context *c) { - struct resource_list_entry *rentry; + struct resource_entry *rentry; - rentry = kmalloc(sizeof(*rentry), GFP_KERNEL); + rentry = resource_list_create_entry(NULL, 0); if (!rentry) { c->error = -ENOMEM; return AE_NO_MEMORY; } - rentry->res = win->res; + *rentry->res = win->res; rentry->offset = win->offset; - list_add_tail(&rentry->node, c->list); + resource_list_add_tail(rentry, c->list); c->count++; return AE_OK; } @@ -534,7 +529,7 @@ static acpi_status acpi_dev_process_resource(struct acpi_resource *ares, * returned as the final error code. * * The resultant struct resource objects are put on the list pointed to by - * @list, that must be empty initially, as members of struct resource_list_entry + * @list, that must be empty initially, as members of struct resource_entry * objects. Callers of this routine should use %acpi_dev_free_resource_list() to * free that list. * diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c index de361a156b34..5a635646e05c 100644 --- a/drivers/dma/acpi-dma.c +++ b/drivers/dma/acpi-dma.c @@ -43,7 +43,7 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp, { const struct acpi_csrt_shared_info *si; struct list_head resource_list; - struct resource_list_entry *rentry; + struct resource_entry *rentry; resource_size_t mem = 0, irq = 0; int ret; @@ -56,10 +56,10 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp, return 0; list_for_each_entry(rentry, &resource_list, node) { - if (resource_type(&rentry->res) == IORESOURCE_MEM) - mem = rentry->res.start; - else if (resource_type(&rentry->res) == IORESOURCE_IRQ) - irq = rentry->res.start; + if (resource_type(rentry->res) == IORESOURCE_MEM) + mem = rentry->res->start; + else if (resource_type(rentry->res) == IORESOURCE_IRQ) + irq = rentry->res->start; } acpi_dev_free_resource_list(&resource_list); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e818decb631f..e53822148b6a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -27,6 +27,7 @@ #include #include /* for struct resource */ +#include #include #include @@ -285,11 +286,6 @@ extern int pnpacpi_disabled; #define PXM_INVAL (-1) -struct resource_win { - struct resource res; - resource_size_t offset; -}; - bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, @@ -300,12 +296,6 @@ unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); -struct resource_list_entry { - struct list_head node; - struct resource res; - resource_size_t offset; -}; - void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), diff --git a/include/linux/resource_ext.h b/include/linux/resource_ext.h new file mode 100644 index 000000000000..e2bf63d881d4 --- /dev/null +++ b/include/linux/resource_ext.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2015, Intel Corporation + * Author: Jiang Liu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _LINUX_RESOURCE_EXT_H +#define _LINUX_RESOURCE_EXT_H +#include +#include +#include +#include + +/* Represent resource window for bridge devices */ +struct resource_win { + struct resource res; /* In master (CPU) address space */ + resource_size_t offset; /* Translation offset for bridge */ +}; + +/* + * Common resource list management data structure and interfaces to support + * ACPI, PNP and PCI host bridge etc. + */ +struct resource_entry { + struct list_head node; + struct resource *res; /* In master (CPU) address space */ + resource_size_t offset; /* Translation offset for bridge */ + struct resource __res; /* Default storage for res */ +}; + +extern struct resource_entry * +resource_list_create_entry(struct resource *res, size_t extra_size); +extern void resource_list_free(struct list_head *head); + +static inline void resource_list_add(struct resource_entry *entry, + struct list_head *head) +{ + list_add(&entry->node, head); +} + +static inline void resource_list_add_tail(struct resource_entry *entry, + struct list_head *head) +{ + list_add_tail(&entry->node, head); +} + +static inline void resource_list_del(struct resource_entry *entry) +{ + list_del(&entry->node); +} + +static inline void resource_list_free_entry(struct resource_entry *entry) +{ + kfree(entry); +} + +static inline void +resource_list_destroy_entry(struct resource_entry *entry) +{ + resource_list_del(entry); + resource_list_free_entry(entry); +} + +#define resource_list_for_each_entry(entry, list) \ + list_for_each_entry((entry), (list), node) + +#define resource_list_for_each_entry_safe(entry, tmp, list) \ + list_for_each_entry_safe((entry), (tmp), (list), node) + +#endif /* _LINUX_RESOURCE_EXT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index 0bcebffc4e77..19f2357dfda3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr) return err; } +struct resource_entry *resource_list_create_entry(struct resource *res, + size_t extra_size) +{ + struct resource_entry *entry; + + entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); + if (entry) { + INIT_LIST_HEAD(&entry->node); + entry->res = res ? res : &entry->__res; + } + + return entry; +} +EXPORT_SYMBOL(resource_list_create_entry); + +void resource_list_free(struct list_head *head) +{ + struct resource_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, node) + resource_list_destroy_entry(entry); +} +EXPORT_SYMBOL(resource_list_free); + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) -- cgit v1.2.3 From f638f4dc0880d515c807a67b8210885a4a4f18bb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 6 Feb 2015 10:36:32 -0600 Subject: livepatch: add missing newline to error message Signed-off-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 9adf86bfbcd5..ff7f47d026ac 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -211,7 +211,7 @@ static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) if (kallsyms_on_each_symbol(klp_verify_callback, &args)) return 0; - pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?", + pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", name, addr); return -EINVAL; } -- cgit v1.2.3 From 7215853e985a4bef1a6c14e00e89dfec84f1e457 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Wed, 17 Dec 2014 18:50:56 -0800 Subject: tracing: Fix unmapping loop in tracing_mark_write Commit 6edb2a8a385f0cdef51dae37ff23e74d76d8a6ce introduced an array map_pages that contains the addresses returned by kmap_atomic. However, when unmapping those pages, map_pages[0] is unmapped before map_pages[1], breaking the nesting requirement as specified in the documentation for kmap_atomic/kunmap_atomic. This was caught by the highmem debug code present in kunmap_atomic. Fix the loop to do the unmapping properly. Link: http://lkml.kernel.org/r/1418871056-6614-1-git-send-email-markivx@codeaurora.org Cc: stable@vger.kernel.org # 3.5+ Reviewed-by: Stephen Boyd Reported-by: Lime Yang Signed-off-by: Vikram Mulukutla Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5afce60e1b68..2078b86750e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4928,7 +4928,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - for (i = 0; i < nr_pages; i++){ + for (i = nr_pages - 1; i >= 0; i--) { kunmap_atomic(map_page[i]); put_page(pages[i]); } -- cgit v1.2.3 From 27ba0644ea9dfe6e7693abc85837b60e40583b96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:59 -0800 Subject: rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 8 +- fs/inode.c | 1 - include/linux/fs.h | 4 +- include/linux/mm.h | 6 -- include/linux/mm_types.h | 4 +- include/linux/rmap.h | 2 - kernel/fork.c | 8 +- mm/migrate.c | 32 ------- mm/mmap.c | 24 ++--- mm/rmap.c | 225 +-------------------------------------------- mm/swap.c | 4 +- 11 files changed, 18 insertions(+), 300 deletions(-) (limited to 'kernel') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index d79b008e4a32..3f9f808b5119 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -317,10 +317,10 @@ maps this page at its virtual address. about doing this. The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear - an empty list, just mark the architecture private page flag bit. - Later, in update_mmu_cache(), a check is made of this flag bit, - and if set the flush is done and the flag bit is cleared. + page->mapping->i_mmap is an empty tree, just mark the architecture + private page flag bit. Later, in update_mmu_cache(), a check is + made of this flag bit, and if set the flush is done and the flag + bit is cleared. IMPORTANT NOTE: It is often important, if you defer the flush, that the actual flush occurs on the same CPU diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..c760fac33c92 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -355,7 +355,6 @@ void address_space_init_once(struct address_space *mapping) INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 47f557c7ef7e..60acab209701 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,6 @@ struct address_space { spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ - struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear); + return !RB_EMPTY_ROOT(&mapping->i_mmap); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ddd9d1d6268..18391eec4864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - list_add_tail(&vma->shared.nonlinear, list); -} - void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6d34aa266a8c..3b1d20fb0848 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -273,15 +273,13 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree, or - * linkage of vma in the address_space->i_mmap_nonlinear list. + * linkage into the address_space->i_mmap interval tree. */ union { struct { struct rb_node rb; unsigned long rb_subtree_last; } linear; - struct list_head nonlinear; } shared; /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d9d7e7e56352..b38f559130d5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition - * file_nonlinear: for handling file nonlinear mapping * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ @@ -255,7 +254,6 @@ struct rmap_walk_control { int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - int (*file_nonlinear)(struct page *, struct address_space *, void *arg); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..b379d9abddc7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..6e284bcca8bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,37 +178,6 @@ out: return SWAP_AGAIN; } -/* - * Congratulations to trinity for discovering this bug. - * mm/fremap.c's remap_file_pages() accepts any range within a single vma to - * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then - * replace the specified range by file ptes throughout (maybe populated after). - * If page migration finds a page within that range, while it's still located - * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: - * zap_pte() clears the temporary migration entry before mmap_sem is dropped. - * But if the migrating page is in a part of the vma outside the range to be - * remapped, then it will not be cleared, and remove_migration_ptes() needs to - * deal with it. Fortunately, this part of the vma is of course still linear, - * so we just need to use linear location on the nonlinear list. - */ -static int remove_linear_migration_ptes_from_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long addr; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr >= vma->vm_start && addr < vma->vm_end) - remove_migration_pte(page, vma, addr, arg); - } - return SWAP_AGAIN; -} - /* * Get rid of all migration entries and replace them by * references to the indicated page. @@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, - .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); diff --git a/mm/mmap.c b/mm/mmap.c index e023dc5e59a8..14d84666e8ba 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.nonlinear); - else - vma_interval_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) { - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); - if (adjust_next) - uprobe_munmap(next, next->vm_start, - next->vm_end); - } + if (adjust_next) + uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); if (insert) { @@ -3177,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * * mmap_sem in write mode is required in order to block all operations * that could modify pagetables and free pages without need of - * altering the vma layout (for example populate_range() with - * nonlinear vmas). It's also needed in write mode to avoid new + * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row diff --git a/mm/rmap.c b/mm/rmap.c index 71cd5bd0c17d..70b32498d4f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (!vma->vm_file || - vma->vm_file->f_mapping != page->mapping) + } else if (page->mapping) { + if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ @@ -1316,211 +1314,6 @@ out_mlock: return ret; } -/* - * objrmap doesn't work for nonlinear VMAs because the assumption that - * offset-into-file correlates with offset-into-virtual-addresses does not hold. - * Consequently, given a particular page and its ->index, we cannot locate the - * ptes which are mapping that page without an exhaustive linear search. - * - * So what this code does is a mini "virtual scan" of each nonlinear VMA which - * maps the file to which the target page belongs. The ->vm_private_data field - * holds the current cursor into that scan. Successive searches will circulate - * around the vma's virtual address space. - * - * So as more replacement pressure is applied to the pages in a nonlinear VMA, - * more scanning pressure is placed against them as well. Eventually pages - * will become fully unmapped and are eligible for eviction. - * - * For very sparsely populated VMAs this is a little inefficient - chances are - * there there won't be many ptes located within the scan cluster. In this case - * maybe we could scan further - to the end of the pte page, perhaps. - * - * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can - * acquire it without blocking. If vma locked, mlock the pages in the cluster, - * rather than unmapping them. If we encounter the "check_page" that vmscan is - * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. - */ -#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) -#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) - -static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, - struct vm_area_struct *vma, struct page *check_page) -{ - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; - struct page *page; - unsigned long address; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - unsigned long end; - int ret = SWAP_AGAIN; - int locked_vma = 0; - - address = (vma->vm_start + cursor) & CLUSTER_MASK; - end = address + CLUSTER_SIZE; - if (address < vma->vm_start) - address = vma->vm_start; - if (end > vma->vm_end) - end = vma->vm_end; - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return ret; - - mmun_start = address; - mmun_end = end; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - /* - * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, - * keep the sem while scanning the cluster for mlocking pages. - */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - locked_vma = (vma->vm_flags & VM_LOCKED); - if (!locked_vma) - up_read(&vma->vm_mm->mmap_sem); /* don't need it */ - } - - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - - for (; address < end; pte++, address += PAGE_SIZE) { - if (!pte_present(*pte)) - continue; - page = vm_normal_page(vma, address, *pte); - BUG_ON(!page || PageAnon(page)); - - if (locked_vma) { - if (page == check_page) { - /* we know we have check_page locked */ - mlock_vma_page(page); - ret = SWAP_MLOCK; - } else if (trylock_page(page)) { - /* - * If we can lock the page, perform mlock. - * Otherwise leave the page alone, it will be - * eventually encountered again later. - */ - mlock_vma_page(page); - unlock_page(page); - } - continue; /* don't unmap */ - } - - /* - * No need for _notify because we're within an - * mmu_notifier_invalidate_range_ {start|end} scope. - */ - if (ptep_clear_flush_young(vma, address, pte)) - continue; - - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); - - /* If nonlinear, store the file page offset in the pte. */ - if (page->index != linear_page_index(vma, address)) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(pteval)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, address, pte, ptfile); - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); - - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, MM_FILEPAGES); - (*mapcount)--; - } - pte_unmap_unlock(pte - 1, ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (locked_vma) - up_read(&vma->vm_mm->mmap_sem); - return ret; -} - -static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - return SWAP_FAIL; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? - */ - mapcount = page_mapcount(page); - if (!mapcount) - return ret; - - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - while (cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - return ret; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; - - return ret; -} - bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .file_nonlinear = try_to_unmap_nonlinear, .anon_lock = page_lock_anon_vma_read, }; @@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page) .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, - /* - * We don't bother to try to find the munlocked page in - * nonlinears. It's costly. Instead, later, page reclaim logic - * may call try_to_unmap() and recover PG_mlocked lazily. - */ - .file_nonlinear = NULL, .anon_lock = page_lock_anon_vma_read, }; @@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; } - if (!rwc->file_nonlinear) - goto done; - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto done; - - ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: i_mmap_unlock_read(mapping); return ret; diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..5b3087228b99 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,10 +1140,8 @@ void __init swap_setup(void) if (bdi_init(swapper_spaces[0].backing_dev_info)) panic("Failed to init swap bdi"); - for (i = 0; i < MAX_SWAPFILES; i++) { + for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); - INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); - } #endif /* Use a smaller cluster for small-memory machines */ -- cgit v1.2.3 From 3cd7645de624939c38f5124b4ac15f8b35a1a8b7 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Feb 2015 14:11:33 -0800 Subject: mm, hugetlb: remove unnecessary lower bound on sysctl handlers"? Commit ed4d4902ebdd ("mm, hugetlb: remove hugetlb_zero and hugetlb_infinity") replaced 'unsigned long hugetlb_zero' with 'int zero' leading to out-of-bounds access in proc_doulongvec_minmax(). Use '.extra1 = NULL' instead of '.extra1 = &zero'. Passing NULL is equivalent to passing minimal value, which is 0 for unsigned types. Fixes: ed4d4902ebdd ("mm, hugetlb: remove hugetlb_zero and hugetlb_infinity") Signed-off-by: Andrey Ryabinin Reported-by: Dmitry Vyukov Suggested-by: Manfred Spraul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 137c7f69b264..88ea2d6e0031 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = &zero, }, #endif { @@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = &zero, }, #endif { -- cgit v1.2.3 From 1e0d6714aceb770b04161fbedd7765d0e1fc27bd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Feb 2015 22:14:53 -0500 Subject: ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 96079180de3d..5040d44fe5a3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -445,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; + wait_queue_head_t full_waiters; bool waiters_pending; + bool full_waiters_pending; + bool wakeup_full; }; /* @@ -527,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); + if (rbwork->wakeup_full) { + rbwork->wakeup_full = false; + wake_up_all(&rbwork->full_waiters); + } } /** @@ -551,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. */ - if (cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; - else { + /* Full only makes sense on per cpu reads */ + full = false; + } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; @@ -562,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) while (true) { - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); /* * The events can happen in critical sections where @@ -584,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * that is necessary is that the wake up happens after * a task has been queued. It's OK for spurious wake ups. */ - work->waiters_pending = true; + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; if (signal_pending(current)) { ret = -EINTR; @@ -613,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) schedule(); } - finish_wait(&work->waiters, &wait); + if (full) + finish_wait(&work->full_waiters, &wait); + else + finish_wait(&work->waiters, &wait); return ret; } @@ -1228,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); + init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -2799,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { + bool pagebusy; + if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -2810,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /** -- cgit v1.2.3 From 49550b605587924b3336386caae53200c68969d3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:12 -0800 Subject: oom: add helpers for setting and clearing TIF_MEMDIE This patchset addresses a race which was described in the changelog for 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend"): : PM freezer relies on having all tasks frozen by the time devices are : getting frozen so that no task will touch them while they are getting : frozen. But OOM killer is allowed to kill an already frozen task in order : to handle OOM situtation. In order to protect from late wake ups OOM : killer is disabled after all tasks are frozen. This, however, still keeps : a window open when a killed task didn't manage to die by the time : freeze_processes finishes. The original patch hasn't closed the race window completely because that would require a more complex solution as it can be seen by this patchset. The primary motivation was to close the race condition between OOM killer and PM freezer _completely_. As Tejun pointed out, even though the race condition is unlikely the harder it would be to debug weird bugs deep in the PM freezer when the debugging options are reduced considerably. I can only speculate what might happen when a task is still runnable unexpectedly. On a plus side and as a side effect the oom enable/disable has a better (full barrier) semantic without polluting hot paths. I have tested the series in KVM with 100M RAM: - many small tasks (20M anon mmap) which are triggering OOM continually - s2ram which resumes automatically is triggered in a loop echo processors > /sys/power/pm_test while true do echo mem > /sys/power/state sleep 1s done - simple module which allocates and frees 20M in 8K chunks. If it sees freezing(current) then it tries another round of allocation before calling try_to_freeze - debugging messages of PM stages and OOM killer enable/disable/fail added and unmark_oom_victim is delayed by 1s after it clears TIF_MEMDIE and before it wakes up waiters. - rebased on top of the current mmotm which means some necessary updates in mm/oom_kill.c. mark_tsk_oom_victim is now called under task_lock but I think this should be OK because __thaw_task shouldn't interfere with any locking down wake_up_process. Oleg? As expected there are no OOM killed tasks after oom is disabled and allocations requested by the kernel thread are failing after all the tasks are frozen and OOM disabled. I wasn't able to catch a race where oom_killer_disable would really have to wait but I kinda expected the race is really unlikely. [ 242.609330] Killed process 2992 (mem_eater) total-vm:24412kB, anon-rss:2164kB, file-rss:4kB [ 243.628071] Unmarking 2992 OOM victim. oom_victims: 1 [ 243.636072] (elapsed 2.837 seconds) done. [ 243.641985] Trying to disable OOM killer [ 243.643032] Waiting for concurent OOM victims [ 243.644342] OOM killer disabled [ 243.645447] Freezing remaining freezable tasks ... (elapsed 0.005 seconds) done. [ 243.652983] Suspending console(s) (use no_console_suspend to debug) [ 243.903299] kmem_eater: page allocation failure: order:1, mode:0x204010 [...] [ 243.992600] PM: suspend of devices complete after 336.667 msecs [ 243.993264] PM: late suspend of devices complete after 0.660 msecs [ 243.994713] PM: noirq suspend of devices complete after 1.446 msecs [ 243.994717] ACPI: Preparing to enter system sleep state S3 [ 243.994795] PM: Saving platform NVS memory [ 243.994796] Disabling non-boot CPUs ... The first 2 patches are simple cleanups for OOM. They should go in regardless the rest IMO. Patches 3 and 4 are trivial printk -> pr_info conversion and they should go in ditto. The main patch is the last one and I would appreciate acks from Tejun and Rafael. I think the OOM part should be OK (except for __thaw_task vs. task_lock where a look from Oleg would appreciated) but I am not so sure I haven't screwed anything in the freezer code. I have found several surprises there. This patch (of 5): This patch is just a preparatory and it doesn't introduce any functional change. Note: I am utterly unhappy about lowmemory killer abusing TIF_MEMDIE just to wait for the oom victim and to prevent from new killing. This is just a side effect of the flag. The primary meaning is to give the oom victim access to the memory reserves and that shouldn't be necessary here. Signed-off-by: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/lowmemorykiller.c | 7 ++++++- include/linux/oom.h | 4 ++++ kernel/exit.c | 2 +- mm/memcontrol.c | 2 +- mm/oom_kill.c | 23 ++++++++++++++++++++--- 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index b545d3d1da3e..feafa172b155 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -160,7 +160,12 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize); lowmem_deathpending_timeout = jiffies + HZ; - set_tsk_thread_flag(selected, TIF_MEMDIE); + /* + * FIXME: lowmemorykiller shouldn't abuse global OOM killer + * infrastructure. There is no real reason why the selected + * task should have access to the memory reserves. + */ + mark_tsk_oom_victim(selected); send_sig(SIGKILL, selected, 0); rem += selected_tasksize; } diff --git a/include/linux/oom.h b/include/linux/oom.h index 76200984d1e2..b42b80f88c3a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -47,6 +47,10 @@ static inline bool oom_task_origin(const struct task_struct *p) return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); } +extern void mark_tsk_oom_victim(struct task_struct *tsk); + +extern void unmark_oom_victim(void); + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/kernel/exit.c b/kernel/exit.c index 6806c55475ee..02b3d1ab2ec0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - clear_thread_flag(TIF_MEMDIE); + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 11c9e6a1dad5..fe4d258ef32b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1556,7 +1556,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 294493a7ae4b..80b34e285f96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,6 +416,23 @@ void note_oom_kill(void) atomic_inc(&oom_kills); } +/** + * mark_tsk_oom_victim - marks the given taks as OOM victim. + * @tsk: task to mark + */ +void mark_tsk_oom_victim(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk, TIF_MEMDIE); +} + +/** + * unmark_oom_victim - unmarks the current task as OOM victim. + */ +void unmark_oom_victim(void) +{ + clear_thread_flag(TIF_MEMDIE); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -440,7 +457,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ task_lock(p); if (p->mm && task_will_free_mem(p)) { - set_tsk_thread_flag(p, TIF_MEMDIE); + mark_tsk_oom_victim(p); task_unlock(p); put_task_struct(p); return; @@ -495,7 +512,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; - set_tsk_thread_flag(victim, TIF_MEMDIE); + mark_tsk_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -652,7 +669,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } -- cgit v1.2.3 From 35536ae170f01fb7e5ca032d5324d03e9e5a36bd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:18 -0800 Subject: PM: convert printk to pr_* equivalent While touching this area let's convert printk to pr_*. This also makes the printing of continuation lines done properly. Signed-off-by: Michal Hocko Acked-by: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 5a6ec8678b9a..3ac45f192e9f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs = elapsed_msecs64; if (todo) { - printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " + pr_cont("\n"); + pr_err("Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, @@ -101,7 +101,7 @@ static int try_to_freeze_tasks(bool user_only) read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, elapsed_msecs % 1000); } @@ -155,7 +155,7 @@ int freeze_processes(void) atomic_inc(&system_freezing_cnt); pm_wakeup_clear(); - printk("Freezing user space processes ... "); + pr_info("Freezing user space processes ... "); pm_freezing = true; oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); @@ -171,13 +171,13 @@ int freeze_processes(void) if (oom_kills_count() != oom_kills_saved && !check_frozen_processes()) { __usermodehelper_set_disable_depth(UMH_ENABLED); - printk("OOM in progress."); + pr_cont("OOM in progress."); error = -EBUSY; } else { - printk("done."); + pr_cont("done."); } } - printk("\n"); + pr_cont("\n"); BUG_ON(in_atomic()); if (error) @@ -197,13 +197,14 @@ int freeze_kernel_threads(void) { int error; - printk("Freezing remaining freezable tasks ... "); + pr_info("Freezing remaining freezable tasks ... "); + pm_nosig_freezing = true; error = try_to_freeze_tasks(false); if (!error) - printk("done."); + pr_cont("done."); - printk("\n"); + pr_cont("\n"); BUG_ON(in_atomic()); if (error) @@ -224,7 +225,7 @@ void thaw_processes(void) oom_killer_enable(); - printk("Restarting tasks ... "); + pr_info("Restarting tasks ... "); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); @@ -243,7 +244,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } @@ -252,7 +253,7 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - printk("Restarting kernel threads ... "); + pr_info("Restarting kernel threads ... "); thaw_workqueues(); @@ -264,5 +265,5 @@ void thaw_kernel_threads(void) read_unlock(&tasklist_lock); schedule(); - printk("done.\n"); + pr_cont("done.\n"); } -- cgit v1.2.3 From c32b3cbe0d067a9cfae85aa70ba1e97ceba0ced7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:24 -0800 Subject: oom, PM: make OOM detection in the freezer path raceless Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") has left a race window when OOM killer manages to note_oom_kill after freeze_processes checks the counter. The race window is quite small and really unlikely and partial solution deemed sufficient at the time of submission. Tejun wasn't happy about this partial solution though and insisted on a full solution. That requires the full OOM and freezer's task freezing exclusion, though. This is done by this patch which introduces oom_sem RW lock and turns oom_killer_disable() into a full OOM barrier. oom_killer_disabled check is moved from the allocation path to the OOM level and we take oom_sem for reading for both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing so it waits for all currently running OOM killer invocations. Then it disable all the further OOMs by setting oom_killer_disabled and checks for any oom victims. Victims are counted via mark_tsk_oom_victim resp. unmark_oom_victim. The last victim wakes up all waiters enqueued by oom_killer_disable(). Therefore this function acts as the full OOM barrier. The page fault path is covered now as well although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reacheable from page fault." so it would be better and more robust to not rely on freezing points here. Same applies to the memcg OOM killer. out_of_memory tells the caller whether the OOM was allowed to trigger and the callers are supposed to handle the situation. The page allocation path simply fails the allocation same as before. The page fault path will retry the fault (more on that later) and Sysrq OOM trigger will simply complain to the log. Normally there wouldn't be any unfrozen user tasks after try_to_freeze_tasks so the function will not block. But if there was an OOM killer racing with try_to_freeze_tasks and the OOM victim didn't finish yet then we have to wait for it. This should complete in a finite time, though, because - the victim cannot loop in the page fault handler (it would die on the way out from the exception) - it cannot loop in the page allocator because all the further allocation would fail and __GFP_NOFAIL allocations are not acceptable at this stage - it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze expects lockless context) and kernel threads and work queues are not frozen yet Signed-off-by: Michal Hocko Suggested-by: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 5 +- include/linux/oom.h | 14 ++---- kernel/exit.c | 3 +- kernel/power/process.c | 50 ++++--------------- mm/memcontrol.c | 2 +- mm/oom_kill.c | 132 +++++++++++++++++++++++++++++++++++++++++-------- mm/page_alloc.c | 17 +------ 7 files changed, 132 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 0071469ecbf1..259a4d5a4e8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, - 0, NULL, true); + if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), + GFP_KERNEL, 0, NULL, true)) + pr_info("OOM request ignored because killer is disabled\n"); } static DECLARE_WORK(moom_work, moom_callback); diff --git a/include/linux/oom.h b/include/linux/oom.h index b42b80f88c3a..d5771bed59c9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); -extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); extern bool oom_killer_disabled; - -static inline void oom_killer_disable(void) -{ - oom_killer_disabled = true; -} - -static inline void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} +extern bool oom_killer_disable(void); +extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/exit.c b/kernel/exit.c index 02b3d1ab2ec0..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - unmark_oom_victim(); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/power/process.c b/kernel/power/process.c index 3ac45f192e9f..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } -static bool __check_frozen_processes(void) -{ - struct task_struct *g, *p; - - for_each_process_thread(g, p) - if (p != current && !freezer_should_skip(p) && !frozen(p)) - return false; - - return true; -} - -/* - * Returns true if all freezable tasks (except for current) are frozen already - */ -static bool check_frozen_processes(void) -{ - bool ret; - - read_lock(&tasklist_lock); - ret = __check_frozen_processes(); - read_unlock(&tasklist_lock); - return ret; -} - /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) int freeze_processes(void) { int error; - int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -157,29 +132,22 @@ int freeze_processes(void) pm_wakeup_clear(); pr_info("Freezing user space processes ... "); pm_freezing = true; - oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - - /* - * There might have been an OOM kill while we were - * freezing tasks and the killed task might be still - * on the way out so we have to double check for race. - */ - if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { - __usermodehelper_set_disable_depth(UMH_ENABLED); - pr_cont("OOM in progress."); - error = -EBUSY; - } else { - pr_cont("done."); - } + pr_cont("done."); } pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disbale + * the OOM killer to disallow any further interference with + * killable tasks. + */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe4d258ef32b..fbf64e6f64e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle) + if (!handle || oom_killer_disabled) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3cbd76b8c13b..b8df76ee2be3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } /* - * Number of OOM killer invocations (including memcg OOM killer). - * Primarily used by PM freezer to check for potential races with - * OOM killed frozen task. + * Number of OOM victims in flight */ -static atomic_t oom_kills = ATOMIC_INIT(0); +static atomic_t oom_victims = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -int oom_kills_count(void) -{ - return atomic_read(&oom_kills); -} - -void note_oom_kill(void) -{ - atomic_inc(&oom_kills); -} +bool oom_killer_disabled __read_mostly; +static DECLARE_RWSEM(oom_sem); /** * mark_tsk_oom_victim - marks the given taks as OOM victim. * @tsk: task to mark + * + * Has to be called with oom_sem taken for read and never after + * oom has been disabled already. */ void mark_tsk_oom_victim(struct task_struct *tsk) { - set_tsk_thread_flag(tsk, TIF_MEMDIE); - + WARN_ON(oom_killer_disabled); + /* OOM killer might race with memcg OOM */ + if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk) * that TIF_MEMDIE tasks should be ignored. */ __thaw_task(tsk); + atomic_inc(&oom_victims); } /** * unmark_oom_victim - unmarks the current task as OOM victim. + * + * Wakes up all waiters in oom_killer_disable() */ void unmark_oom_victim(void) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_thread_flag(TIF_MEMDIE)) + return; + + down_read(&oom_sem); + /* + * There is no need to signal the lasst oom_victim if there + * is nobody who cares. + */ + if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + wake_up_all(&oom_victims_wait); + up_read(&oom_sem); +} + +/** + * oom_killer_disable - disable OOM killer + * + * Forces all page allocations to fail rather than trigger OOM killer. + * Will block and wait until all OOM victims are killed. + * + * The function cannot be called when there are runnable user tasks because + * the userspace would see unexpected allocation failures as a result. Any + * new usage of this function should be consulted with MM people. + * + * Returns true if successful and false if the OOM killer cannot be + * disabled. + */ +bool oom_killer_disable(void) +{ + /* + * Make sure to not race with an ongoing OOM killer + * and that the current is not the victim. + */ + down_write(&oom_sem); + if (test_thread_flag(TIF_MEMDIE)) { + up_write(&oom_sem); + return false; + } + + oom_killer_disabled = true; + up_write(&oom_sem); + + wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + + return true; +} + +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + down_write(&oom_sem); + oom_killer_disabled = false; + up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) } /** - * out_of_memory - kill the "best" process when we run out of memory + * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 @@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; @@ -718,6 +771,32 @@ out: schedule_timeout_killable(1); } +/** + * out_of_memory - tries to invoke OOM killer. + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting + * + * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() + * when it returns false. Otherwise returns true. + */ +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) +{ + bool ret = false; + + down_read(&oom_sem); + if (!oom_killer_disabled) { + __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); + ret = true; + } + up_read(&oom_sem); + + return ret; +} + /* * The pagefault handler calls here because it is out of memory, so kill a * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a @@ -727,12 +806,25 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; + down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - return; + goto unlock; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - out_of_memory(NULL, 0, 0, NULL, false); + if (!oom_killer_disabled) + __out_of_memory(NULL, 0, 0, NULL, false); + else + /* + * There shouldn't be any user tasks runable while the + * OOM killer is disabled so the current task has to + * be a racing OOM victim for which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } +unlock: + up_read(&oom_sem); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 641d5a9a8617..134e25525044 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; - if (oom_killer_disabled) - return NULL; - /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. @@ -2330,14 +2325,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } - /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. - */ - note_oom_kill(); - /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if @@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); - *did_some_progress = 1; + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; -- cgit v1.2.3 From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:50 -0800 Subject: mm: account pmd page tables to the process Dave noticed that unprivileged process can allocate significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. Linux kernel doesn't account PMD tables to the process, only PTE. The use-cases below use few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. #include #include #include #include #include #include #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by account PMD tables to the process the same way we account PTE. The main place where PMD tables is accounted is __pmd_alloc() and free_pmd_range(). But there're few corner cases: - HugeTLB can share PMD page tables. The patch handles by accounting the table to all processes who share it. - x86 PAE pre-allocates few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity check on exit(2). Accounting only happens on configuration where PMD page table's level is present (PMD is not folded). As with nr_ptes we use per-mm counter. The counter value is used to calculate baseline for badness score by oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++------ arch/x86/mm/pgtable.c | 14 +++++++++----- fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm.h | 24 ++++++++++++++++++++++++ include/linux/mm_types.h | 3 ++- kernel/fork.c | 3 +++ mm/debug.c | 3 ++- mm/hugetlb.c | 8 ++++++-- mm/memory.c | 15 +++++++++------ mm/mmap.c | 4 +++- mm/oom_kill.c | 9 +++++---- 11 files changed, 75 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 4415aa915681..e9c706e4627a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -555,12 +555,12 @@ this is causing problems for your system/application. oom_dump_tasks -Enables a system-wide task dump (excluding kernel threads) to be -produced when the kernel performs an OOM-killing and includes such -information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, -oom_score_adj score, and name. This is helpful to determine why the -OOM killer was invoked, to identify the rogue task that caused it, -and to determine why the OOM killer chose the task it did to kill. +Enables a system-wide task dump (excluding kernel threads) to be produced +when the kernel performs an OOM-killing and includes such information as +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..7b22adaad4f1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); } } } @@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: free_page((unsigned long)pgd); out: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6396f88c6687..e6e0abeb5d12 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap; + unsigned long data, text, lib, swap, ptes, pmds; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE * sizeof(pte_t) * - atomic_long_read(&mm->nr_ptes)) >> 10, + ptes >> 10, + pmds >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm.h b/include/linux/mm.h index c6bf813a6b3d..644990b83cda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1438,8 +1438,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, { return 0; } + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} +static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} + #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_pmds); +} + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_pmds); +} + +static inline void mm_dec_nr_pmds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_pmds); +} #endif int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 20ff2105b564..199a03aab8dc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -363,7 +363,8 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ - atomic_long_t nr_ptes; /* Page table pages */ + atomic_long_t nr_ptes; /* PTE page table pages */ + atomic_long_t nr_pmds; /* PMD page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/fork.c b/kernel/fork.c index b379d9abddc7..c99098c52641 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); +#ifndef __PAGETABLE_PMD_FOLDED + atomic_long_set(&mm->nr_pmds, 0); +#endif mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; diff --git a/mm/debug.c b/mm/debug.c index d69cb5a7ba9a..3eb3ac2fcee7 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd28d6ba5e5d..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { + mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); spin_lock(ptl); - if (pud_none(*pud)) + if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else + } else { put_page(virt_to_page(spte)); + mm_inc_nr_pmds(mm); + } spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); @@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) pud_clear(pud); put_page(virt_to_page(ptep)); + mm_dec_nr_pmds(mm); *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } diff --git a/mm/memory.c b/mm/memory.c index d63849b5188f..bbe6a73a899d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) /* Another has populated it */ - pmd_free(mm, new); - else + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); pud_populate(mm, pud, new); -#else - if (pgd_present(*pud)) /* Another has populated it */ + } else /* Another has populated it */ pmd_free(mm, new); - else +#else + if (!pgd_present(*pud)) { + mm_inc_nr_pmds(mm); pgd_populate(mm, pud, new); + } else /* Another has populated it */ + pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index 14d84666e8ba..6a7d36d133fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT); + WARN_ON(mm_nr_pmds(mm) > + round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b8df76ee2be3..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + - get_mm_counter(p->mm, MM_SWAPENTS); + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); task_unlock(p); /* @@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), + mm_nr_pmds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From b30fe6c7ced70f62862c3d09357e7e8084e98d9f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:53 -0800 Subject: mm: fix false-positive warning on exit due mm_nr_pmds(mm) The problem is that we check nr_ptes/nr_pmds in exit_mmap() which happens *before* pgd_free(). And if an arch does pte/pmd allocation in pgd_alloc() and frees them in pgd_free() we see offset in counters by the time of the checks. We tried to workaround this by offsetting expected counter value according to FIRST_USER_ADDRESS for both nr_pte and nr_pmd in exit_mmap(). But it doesn't work in some cases: 1. ARM with LPAE enabled also has non-zero USER_PGTABLES_CEILING, but upper addresses occupied with huge pmd entries, so the trick with offsetting expected counter value will get really ugly: we will have to apply it nr_pmds, but not nr_ptes. 2. Metag has non-zero FIRST_USER_ADDRESS, but doesn't do allocation pte/pmd page tables allocation in pgd_alloc(), just setup a pgd entry which is allocated at boot and shared accross all processes. The proposal is to move the check to check_mm() which happens *after* pgd_free() and do proper accounting during pgd_alloc() and pgd_free() which would bring counters to zero if nothing leaked. Signed-off-by: Kirill A. Shutemov Reported-by: Tyler Baker Tested-by: Tyler Baker Tested-by: Nishanth Menon Cc: Russell King Cc: James Hogan Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 4 ++++ arch/unicore32/mm/pgd.c | 3 +++ kernel/fork.c | 8 ++++++++ mm/mmap.c | 5 ----- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 249379535be2..a3681f11dd9f 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -97,6 +97,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) no_pte: pmd_free(mm, new_pmd); + mm_dec_nr_pmds(mm); no_pmd: pud_free(mm, new_pud); no_pud: @@ -130,9 +131,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); + atomic_long_dec(&mm->nr_ptes); no_pmd: pud_clear(pud); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); no_pud: pgd_clear(pgd); pud_free(mm, pud); @@ -152,6 +155,7 @@ no_pgd: pmd = pmd_offset(pud, 0); pud_clear(pud); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); pgd_clear(pgd); pud_free(mm, pud); } diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index 08b8d4295e70..2ade20d8eab3 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -69,6 +69,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) no_pte: pmd_free(mm, new_pmd); + mm_dec_nr_pmds(mm); no_pmd: free_pages((unsigned long)new_pgd, 0); no_pgd: @@ -96,7 +97,9 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); + atomic_long_dec(&mm->nr_ptes); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); free: free_pages((unsigned long) pgd, 0); } diff --git a/kernel/fork.c b/kernel/fork.c index c99098c52641..66e19c251581 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -606,6 +606,14 @@ static void check_mm(struct mm_struct *mm) printk(KERN_ALERT "BUG: Bad rss-counter state " "mm:%p idx:%d val:%ld\n", mm, i, x); } + + if (atomic_long_read(&mm->nr_ptes)) + pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", + atomic_long_read(&mm->nr_ptes)); + if (mm_nr_pmds(mm)) + pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", + mm_nr_pmds(mm)); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif diff --git a/mm/mmap.c b/mm/mmap.c index 6a7d36d133fb..c5f44682c0d1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2851,11 +2851,6 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); } vm_unacct_memory(nr_accounted); - - WARN_ON(atomic_long_read(&mm->nr_ptes) > - round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT); - WARN_ON(mm_nr_pmds(mm) > - round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); } /* Insert vm structure into process list sorted by address -- cgit v1.2.3