From bdf2b8cbf076568f7d17dc62467348d409142298 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:54 +0900 Subject: selftests/ftrace: Add user-memory access syntax testcase Add a user-memory access syntax testcase which checks new user-memory access syntax and ustring type. Link: http://lkml.kernel.org/r/155789873385.26965.9557271156179140676.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../ftrace/test.d/kprobe/kprobe_args_user.tc | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc new file mode 100644 index 000000000000..0f60087583d8 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc @@ -0,0 +1,32 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event user-memory access + +[ -f kprobe_events ] || exit_unsupported # this is configurable + +grep -q '\$arg' README || exit_unresolved # depends on arch +grep -A10 "fetcharg:" README | grep -q 'ustring' || exit_unsupported +grep -A10 "fetcharg:" README | grep -q '\[u\]' || exit_unsupported + +:;: "user-memory access syntax and ustring working on user memory";: +echo 'p:myevent do_sys_open path=+0($arg2):ustring path2=+u0($arg2):string' \ + > kprobe_events + +grep myevent kprobe_events | \ + grep -q 'path=+0($arg2):ustring path2=+u0($arg2):string' +echo 1 > events/kprobes/myevent/enable +echo > /dev/null +echo 0 > events/kprobes/myevent/enable + +grep myevent trace | grep -q 'path="/dev/null" path2="/dev/null"' + +:;: "user-memory access syntax and ustring not working with kernel memory";: +echo 'p:myevent vfs_symlink path=+0($arg3):ustring path2=+u0($arg3):string' \ + > kprobe_events +echo 1 > events/kprobes/myevent/enable +ln -s foo $TMPDIR/bar +echo 0 > events/kprobes/myevent/enable + +grep myevent trace | grep -q 'path=(fault) path2=(fault)' + +exit 0 -- cgit v1.2.3 From 1e032f7cfa141b4424827b0ecb0ea899f84e182e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:39:05 +0900 Subject: perf-probe: Add user memory access attribute support Add user memory access attribute for kprobe event arguments. If a given 'local variable' is in user-space, User can specify memory access method by '@user' suffix. This is not only for string but also for data structure. If we access a field of data structure in user memory from kernel on some arch, it will fail. e.g. perf probe -a "sched_setscheduler param->sched_priority" This will fail to access the "param->sched_priority" because the param is __user pointer. Instead, we can now specify @user suffix for such argument. perf probe -a "sched_setscheduler param->sched_priority@user" Note that kernel memory access with "@user" must always fail on any arch. Link: http://lkml.kernel.org/r/155789874562.26965.10836126971405890891.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/perf/Documentation/perf-probe.txt | 3 ++- tools/perf/util/probe-event.c | 11 +++++++++++ tools/perf/util/probe-event.h | 2 ++ tools/perf/util/probe-file.c | 7 +++++++ tools/perf/util/probe-file.h | 1 + tools/perf/util/probe-finder.c | 19 ++++++++++++------- 6 files changed, 35 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index b6866a05edd2..ed3ecfa422e1 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -194,12 +194,13 @@ PROBE ARGUMENT -------------- Each probe argument follows below syntax. - [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE] + [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE][@user] 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.) '$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters. 'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo (*). Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal integers (x/x8/x16/x32/x64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail) On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid. +"@user" is a special attribute which means the LOCALVAR will be treated as a user-space memory. This is only valid for kprobe event. TYPES ----- diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 198e09ff611e..a7ca17be5fc5 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1577,6 +1577,17 @@ static int parse_perf_probe_arg(char *str, struct perf_probe_arg *arg) str = tmp + 1; } + tmp = strchr(str, '@'); + if (tmp && tmp != str && strcmp(tmp + 1, "user")) { /* user attr */ + if (!user_access_is_supported()) { + semantic_error("ftrace does not support user access\n"); + return -EINVAL; + } + *tmp = '\0'; + arg->user_access = true; + pr_debug("user_access "); + } + tmp = strchr(str, ':'); if (tmp) { /* Type setting */ *tmp = '\0'; diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 05c8d571a901..96a319cd2378 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -37,6 +37,7 @@ struct probe_trace_point { struct probe_trace_arg_ref { struct probe_trace_arg_ref *next; /* Next reference */ long offset; /* Offset value */ + bool user_access; /* User-memory access */ }; /* kprobe-tracer and uprobe-tracer tracing argument */ @@ -82,6 +83,7 @@ struct perf_probe_arg { char *var; /* Variable name */ char *type; /* Type name */ struct perf_probe_arg_field *field; /* Structure fields */ + bool user_access; /* User-memory access */ }; /* Perf probe probing event (point + arg) */ diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index 4062bc4412a9..89ce1a9c3798 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -1015,6 +1015,7 @@ enum ftrace_readme { FTRACE_README_PROBE_TYPE_X = 0, FTRACE_README_KRETPROBE_OFFSET, FTRACE_README_UPROBE_REF_CTR, + FTRACE_README_USER_ACCESS, FTRACE_README_END, }; @@ -1027,6 +1028,7 @@ static struct { DEFINE_TYPE(FTRACE_README_PROBE_TYPE_X, "*type: * x8/16/32/64,*"), DEFINE_TYPE(FTRACE_README_KRETPROBE_OFFSET, "*place (kretprobe): *"), DEFINE_TYPE(FTRACE_README_UPROBE_REF_CTR, "*ref_ctr_offset*"), + DEFINE_TYPE(FTRACE_README_USER_ACCESS, "*[u]*"), }; static bool scan_ftrace_readme(enum ftrace_readme type) @@ -1087,3 +1089,8 @@ bool uprobe_ref_ctr_is_supported(void) { return scan_ftrace_readme(FTRACE_README_UPROBE_REF_CTR); } + +bool user_access_is_supported(void) +{ + return scan_ftrace_readme(FTRACE_README_USER_ACCESS); +} diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h index 2a249182f2a6..986c1c94f64f 100644 --- a/tools/perf/util/probe-file.h +++ b/tools/perf/util/probe-file.h @@ -70,6 +70,7 @@ int probe_cache__show_all_caches(struct strfilter *filter); bool probe_type_is_available(enum probe_type type); bool kretprobe_offset_is_supported(void); bool uprobe_ref_ctr_is_supported(void); +bool user_access_is_supported(void); #else /* ! HAVE_LIBELF_SUPPORT */ static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused, struct nsinfo *nsi __maybe_unused) { diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index c37fbef1711d..c202027716d0 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -294,7 +294,7 @@ static_var: static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg *tvar, - const char *cast) + const char *cast, bool user_access) { struct probe_trace_arg_ref **ref_ptr = &tvar->ref; Dwarf_Die type; @@ -334,7 +334,8 @@ static int convert_variable_type(Dwarf_Die *vr_die, pr_debug("%s type is %s.\n", dwarf_diename(vr_die), dwarf_diename(&type)); - if (cast && strcmp(cast, "string") == 0) { /* String type */ + if (cast && (!strcmp(cast, "string") || !strcmp(cast, "ustring"))) { + /* String type */ ret = dwarf_tag(&type); if (ret != DW_TAG_pointer_type && ret != DW_TAG_array_type) { @@ -357,6 +358,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, pr_warning("Out of memory error\n"); return -ENOMEM; } + (*ref_ptr)->user_access = user_access; } if (!die_compare_name(&type, "char") && !die_compare_name(&type, "unsigned char")) { @@ -411,7 +413,7 @@ formatted: static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, struct perf_probe_arg_field *field, struct probe_trace_arg_ref **ref_ptr, - Dwarf_Die *die_mem) + Dwarf_Die *die_mem, bool user_access) { struct probe_trace_arg_ref *ref = *ref_ptr; Dwarf_Die type; @@ -448,6 +450,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, *ref_ptr = ref; } ref->offset += dwarf_bytesize(&type) * field->index; + ref->user_access = user_access; goto next; } else if (tag == DW_TAG_pointer_type) { /* Check the pointer and dereference */ @@ -519,17 +522,18 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, } } ref->offset += (long)offs; + ref->user_access = user_access; /* If this member is unnamed, we need to reuse this field */ if (!dwarf_diename(die_mem)) return convert_variable_fields(die_mem, varname, field, - &ref, die_mem); + &ref, die_mem, user_access); next: /* Converting next field */ if (field->next) return convert_variable_fields(die_mem, field->name, - field->next, &ref, die_mem); + field->next, &ref, die_mem, user_access); else return 0; } @@ -555,11 +559,12 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf) else if (ret == 0 && pf->pvar->field) { ret = convert_variable_fields(vr_die, pf->pvar->var, pf->pvar->field, &pf->tvar->ref, - &die_mem); + &die_mem, pf->pvar->user_access); vr_die = &die_mem; } if (ret == 0) - ret = convert_variable_type(vr_die, pf->tvar, pf->pvar->type); + ret = convert_variable_type(vr_die, pf->tvar, pf->pvar->type, + pf->pvar->user_access); /* *expr will be cached in libdw. Don't free it. */ return ret; } -- cgit v1.2.3 From 8a5e0af240e07dd3d4897eb8ff52aab757da7fab Mon Sep 17 00:00:00 2001 From: Alan Mikhak Date: Thu, 23 May 2019 14:18:00 -0700 Subject: tools: PCI: Fix broken pcitest compilation pcitest is currently broken due to the following compiler error and related warning. Fix by changing the run_test() function signature to return an integer result. pcitest.c: In function run_test: pcitest.c:143:9: warning: return with a value, in function returning void return (ret < 0) ? ret : 1 - ret; /* return 0 if test succeeded */ pcitest.c: In function main: pcitest.c:232:9: error: void value not ignored as it ought to be return run_test(test); Fixes: fef31ecaaf2c ("tools: PCI: Fix compilation warnings") Signed-off-by: Alan Mikhak Signed-off-by: Lorenzo Pieralisi Reviewed-by: Paul Walmsley --- tools/pci/pcitest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/pci/pcitest.c b/tools/pci/pcitest.c index 5fa5c2bdd427..6dce894667f6 100644 --- a/tools/pci/pcitest.c +++ b/tools/pci/pcitest.c @@ -47,15 +47,15 @@ struct pci_test { unsigned long size; }; -static void run_test(struct pci_test *test) +static int run_test(struct pci_test *test) { - long ret; + int ret = -EINVAL; int fd; fd = open(test->device, O_RDWR); if (fd < 0) { perror("can't open PCI Endpoint Test device"); - return; + return -ENODEV; } if (test->barnum >= 0 && test->barnum <= 5) { -- cgit v1.2.3 From 81cb4203a5fec7d3f8cf4f145e8e7077b2dcbc78 Mon Sep 17 00:00:00 2001 From: Alan Mikhak Date: Thu, 23 May 2019 14:18:01 -0700 Subject: tools: PCI: Fix compiler warning in pcitest Fix the following compiler warning in pcitest: pcitest.c: In function main: pcitest.c:214:4: warning: too many arguments for format [-Wformat-extra-args] "usage: %s [options]\n" Fixes: fbca0b284bd0 ("tools: PCI: Add 'h' in optstring of getopt()") Signed-off-by: Alan Mikhak Signed-off-by: Lorenzo Pieralisi Reviewed-by: Paul Walmsley --- tools/pci/pcitest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/pci/pcitest.c b/tools/pci/pcitest.c index 6dce894667f6..6f1303104d84 100644 --- a/tools/pci/pcitest.c +++ b/tools/pci/pcitest.c @@ -223,7 +223,7 @@ usage: "\t-r Read buffer test\n" "\t-w Write buffer test\n" "\t-c Copy buffer test\n" - "\t-s Size of buffer {default: 100KB}\n", + "\t-s Size of buffer {default: 100KB}\n" "\t-h Print this help message\n", argv[0]); return -EINVAL; -- cgit v1.2.3 From 76ab785e7396796118016c8bc9fd62b58fe422ea Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:58 -0600 Subject: NTB: Add ntb_msi_test support to ntb_test When the ntb_msi_test module is available, the test code will trigger each of the interrupts and ensure the corresponding occurrences files gets incremented. Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Signed-off-by: Jon Mason --- tools/testing/selftests/ntb/ntb_test.sh | 54 +++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ntb/ntb_test.sh b/tools/testing/selftests/ntb/ntb_test.sh index 08cbfbbc7029..7df497095b72 100755 --- a/tools/testing/selftests/ntb/ntb_test.sh +++ b/tools/testing/selftests/ntb/ntb_test.sh @@ -87,10 +87,10 @@ set -e function _modprobe() { - modprobe "$@" + modprobe "$@" || return 1 if [[ "$REMOTE_HOST" != "" ]]; then - ssh "$REMOTE_HOST" modprobe "$@" + ssh "$REMOTE_HOST" modprobe "$@" || return 1 fi } @@ -451,6 +451,30 @@ function pingpong_test() echo " Passed" } +function msi_test() +{ + LOC=$1 + REM=$2 + + write_file 1 $LOC/ready + + echo "Running MSI interrupt tests on: $(subdirname $LOC) / $(subdirname $REM)" + + CNT=$(read_file "$LOC/count") + for ((i = 0; i < $CNT; i++)); do + START=$(read_file $REM/../irq${i}_occurrences) + write_file $i $LOC/trigger + END=$(read_file $REM/../irq${i}_occurrences) + + if [[ $(($END - $START)) != 1 ]]; then + echo "MSI did not trigger the interrupt on the remote side!" >&2 + exit 1 + fi + done + + echo " Passed" +} + function perf_test() { USE_DMA=$1 @@ -529,6 +553,29 @@ function ntb_pingpong_tests() _modprobe -r ntb_pingpong } +function ntb_msi_tests() +{ + LOCAL_MSI="$DEBUGFS/ntb_msi_test/$LOCAL_DEV" + REMOTE_MSI="$REMOTE_HOST:$DEBUGFS/ntb_msi_test/$REMOTE_DEV" + + echo "Starting ntb_msi_test tests..." + + if ! _modprobe ntb_msi_test 2> /dev/null; then + echo " Not doing MSI tests seeing the module is not available." + return + fi + + port_test $LOCAL_MSI $REMOTE_MSI + + LOCAL_PEER="$LOCAL_MSI/peer$LOCAL_PIDX" + REMOTE_PEER="$REMOTE_MSI/peer$REMOTE_PIDX" + + msi_test $LOCAL_PEER $REMOTE_PEER + msi_test $REMOTE_PEER $LOCAL_PEER + + _modprobe -r ntb_msi_test +} + function ntb_perf_tests() { LOCAL_PERF="$DEBUGFS/ntb_perf/$LOCAL_DEV" @@ -550,6 +597,7 @@ function cleanup() _modprobe -r ntb_perf 2> /dev/null _modprobe -r ntb_pingpong 2> /dev/null _modprobe -r ntb_transport 2> /dev/null + _modprobe -r ntb_msi_test 2> /dev/null set -e } @@ -586,5 +634,7 @@ ntb_tool_tests echo ntb_pingpong_tests echo +ntb_msi_tests +echo ntb_perf_tests echo -- cgit v1.2.3 From 8d0f1e05ab16c4bd628ddaefd20b94ffb36d799c Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Mon, 17 Jun 2019 17:24:58 -0400 Subject: selftests/powerpc: Fix earlyclobber in tm-vmxcopy In some cases, compiler can allocate the same register for operand 'res' and 'vecoutptr', resulting in segfault at 'stxvd2x 40,0,%[vecoutptr]' because base register will contain 1, yielding a false-positive. This is because output 'res' must be marked as an earlyclobber operand so it may not overlap an input operand ('vecoutptr'). Signed-off-by: Gustavo Romero Reviewed-by: Segher Boessenkool Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/tm/tm-vmxcopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c index fe52811584ae..0815f06f3590 100644 --- a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c +++ b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c @@ -79,7 +79,7 @@ int test_vmxcopy() "5:;" "stxvd2x 40,0,%[vecoutptr];" - : [res]"=r"(aborted) + : [res]"=&r"(aborted) : [vecinptr]"r"(&vecin), [vecoutptr]"r"(&vecout), [map]"r"(a) -- cgit v1.2.3 From 6820e565d35084bb9d3a167e990fcb780ccd716f Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Sun, 23 Jun 2019 18:52:00 +0300 Subject: selftests/powerpc: ppc_asm.h: typo in the header guard The guard macro __PPC_ASM_H in the header ppc_asm.h doesn't match the #ifndef macro _PPC_ASM_H. The patch makes them the same. Signed-off-by: Denis Efremov Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h index d2c0a911f55e..2b488b78c4f2 100644 --- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h +++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PPC_ASM_H -#define __PPC_ASM_H +#define _PPC_ASM_H #include #ifndef r1 -- cgit v1.2.3 From 1e240e8d4a7d92232b6214e02a0a4197a53afd6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:08 +0200 Subject: memremap: move dev_pagemap callbacks into a separate structure The dev_pagemap is a growing too many callbacks. Move them into a separate ops structure so that they are not duplicated for multiple instances, and an attacker can't easily overwrite them. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/dax/device.c | 11 +++++++---- drivers/dax/pmem/core.c | 2 +- drivers/nvdimm/pmem.c | 19 +++++++++++-------- drivers/pci/p2pdma.c | 8 ++++++-- include/linux/memremap.h | 36 ++++++++++++++++++++---------------- kernel/memremap.c | 18 +++++++++--------- mm/hmm.c | 10 +++++++--- tools/testing/nvdimm/test/iomap.c | 7 ++++--- 8 files changed, 65 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 79014baa782d..f390083a64d7 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -36,9 +36,8 @@ static void dev_dax_percpu_exit(struct percpu_ref *ref) percpu_ref_exit(ref); } -static void dev_dax_percpu_kill(struct percpu_ref *data) +static void dev_dax_percpu_kill(struct percpu_ref *ref) { - struct percpu_ref *ref = data; struct dev_dax *dev_dax = ref_to_dev_dax(ref); dev_dbg(&dev_dax->dev, "%s\n", __func__); @@ -442,6 +441,11 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } +static const struct dev_pagemap_ops dev_dax_pagemap_ops = { + .kill = dev_dax_percpu_kill, + .cleanup = dev_dax_percpu_exit, +}; + int dev_dax_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -466,9 +470,8 @@ int dev_dax_probe(struct device *dev) return rc; dev_dax->pgmap.ref = &dev_dax->ref; - dev_dax->pgmap.kill = dev_dax_percpu_kill; - dev_dax->pgmap.cleanup = dev_dax_percpu_exit; dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX; + dev_dax->pgmap.ops = &dev_dax_pagemap_ops; addr = devm_memremap_pages(dev, &dev_dax->pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index f9f51786d556..6eb6dfdf19bf 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) struct dev_dax *dev_dax; struct nd_namespace_io *nsio; struct dax_region *dax_region; - struct dev_pagemap pgmap = { 0 }; + struct dev_pagemap pgmap = { }; struct nd_namespace_common *ndns; struct nd_dax *nd_dax = to_nd_dax(dev); struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 24d7fe7c74ed..c2449af2b388 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -303,7 +303,7 @@ static const struct attribute_group *pmem_attribute_groups[] = { NULL, }; -static void __pmem_release_queue(struct percpu_ref *ref) +static void pmem_pagemap_cleanup(struct percpu_ref *ref) { struct request_queue *q; @@ -313,10 +313,10 @@ static void __pmem_release_queue(struct percpu_ref *ref) static void pmem_release_queue(void *ref) { - __pmem_release_queue(ref); + pmem_pagemap_cleanup(ref); } -static void pmem_freeze_queue(struct percpu_ref *ref) +static void pmem_pagemap_kill(struct percpu_ref *ref) { struct request_queue *q; @@ -339,19 +339,24 @@ static void pmem_release_pgmap_ops(void *__pgmap) dev_pagemap_put_ops(); } -static void fsdax_pagefree(struct page *page, void *data) +static void pmem_pagemap_page_free(struct page *page, void *data) { wake_up_var(&page->_refcount); } +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .page_free = pmem_pagemap_page_free, + .kill = pmem_pagemap_kill, + .cleanup = pmem_pagemap_cleanup, +}; + static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) { dev_pagemap_get_ops(); if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) return -ENOMEM; pgmap->type = MEMORY_DEVICE_FS_DAX; - pgmap->page_free = fsdax_pagefree; - + pgmap->ops = &fsdax_pagemap_ops; return 0; } @@ -409,8 +414,6 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; - pmem->pgmap.kill = pmem_freeze_queue; - pmem->pgmap.cleanup = __pmem_release_queue; if (is_nd_pfn(dev)) { if (setup_pagemap_fsdax(dev, &pmem->pgmap)) return -ENOMEM; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index a4994aa3acc0..fb039259d463 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -153,6 +153,11 @@ out: return error; } +static const struct dev_pagemap_ops pci_p2pdma_pagemap_ops = { + .kill = pci_p2pdma_percpu_kill, + .cleanup = pci_p2pdma_percpu_cleanup, +}; + /** * pci_p2pdma_add_resource - add memory for use as p2p memory * @pdev: the device to add the memory to @@ -208,8 +213,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); - pgmap->kill = pci_p2pdma_percpu_kill; - pgmap->cleanup = pci_p2pdma_percpu_cleanup; + pgmap->ops = &pci_p2pdma_pagemap_ops; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 0c86f2c5ac9c..919755f48c7e 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -63,41 +63,45 @@ enum memory_type { MEMORY_DEVICE_PCI_P2PDMA, }; -/* - * Additional notes about MEMORY_DEVICE_PRIVATE may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief - * explanation in include/linux/memory_hotplug.h. - * - * The page_free() callback is called once the page refcount reaches 1 - * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. - * This allows the device driver to implement its own memory management.) - */ -typedef void (*dev_page_free_t)(struct page *page, void *data); +struct dev_pagemap_ops { + /* + * Called once the page refcount reaches 1. (ZONE_DEVICE pages never + * reach 0 refcount unless there is a refcount bug. This allows the + * device driver to implement its own memory management.) + */ + void (*page_free)(struct page *page, void *data); + + /* + * Transition the refcount in struct dev_pagemap to the dead state. + */ + void (*kill)(struct percpu_ref *ref); + + /* + * Wait for refcount in struct dev_pagemap to be idle and reap it. + */ + void (*cleanup)(struct percpu_ref *ref); +}; /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings - * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping - * @kill: callback to transition @ref to the dead state - * @cleanup: callback to wait for @ref to be idle and reap it * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h + * @ops: method table */ struct dev_pagemap { - dev_page_free_t page_free; struct vmem_altmap altmap; bool altmap_valid; struct resource res; struct percpu_ref *ref; - void (*kill)(struct percpu_ref *ref); - void (*cleanup)(struct percpu_ref *ref); struct device *dev; void *data; enum memory_type type; u64 pci_p2pdma_bus_offset; + const struct dev_pagemap_ops *ops; }; #ifdef CONFIG_ZONE_DEVICE diff --git a/kernel/memremap.c b/kernel/memremap.c index abda62d1e5a3..0824237ef979 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + pgmap->ops->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -128,8 +128,8 @@ static void devm_memremap_pages_release(void *data) * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized - * by the caller before passing it to this function + * 1/ At a minimum the res, ref and type and ops members of @pgmap must be + * initialized by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true @@ -179,7 +179,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { + if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || + !pgmap->ops->cleanup) { WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); } @@ -293,9 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); - + pgmap->ops->kill(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); @@ -388,7 +388,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page, page->pgmap->data); } else if (!count) __put_page(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 48574f8485bb..583a02a16872 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1384,6 +1384,12 @@ static void hmm_devmem_free(struct page *page, void *data) devmem->ops->free(devmem, page); } +static const struct dev_pagemap_ops hmm_pagemap_ops = { + .page_free = hmm_devmem_free, + .kill = hmm_devmem_ref_kill, + .cleanup = hmm_devmem_ref_exit, +}; + /* * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory * @@ -1438,12 +1444,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.ops = &hmm_pagemap_ops; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; result = devm_memremap_pages(devmem->device, &devmem->pagemap); if (IS_ERR(result)) diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 076df22e4bda..cf3f064a697d 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -100,9 +100,10 @@ static void nfit_test_kill(void *_pgmap) { struct dev_pagemap *pgmap = _pgmap; - WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup); - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); + WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || + !pgmap->ops->cleanup); + pgmap->ops->kill(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3 From d8668bb0451c3c45b59dbcde2654e0539aad1d2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:09 +0200 Subject: memremap: pass a struct dev_pagemap to ->kill and ->cleanup Passing the actual typed structure leads to more understandable code vs just passing the ref member. Reported-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/dax/device.c | 12 ++++++------ drivers/nvdimm/pmem.c | 18 +++++++++--------- drivers/pci/p2pdma.c | 9 +++++---- include/linux/memremap.h | 4 ++-- kernel/memremap.c | 8 ++++---- mm/hmm.c | 10 +++++----- tools/testing/nvdimm/test/iomap.c | 4 ++-- 7 files changed, 33 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index f390083a64d7..b5257038c188 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -27,21 +27,21 @@ static void dev_dax_percpu_release(struct percpu_ref *ref) complete(&dev_dax->cmp); } -static void dev_dax_percpu_exit(struct percpu_ref *ref) +static void dev_dax_percpu_exit(struct dev_pagemap *pgmap) { - struct dev_dax *dev_dax = ref_to_dev_dax(ref); + struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); dev_dbg(&dev_dax->dev, "%s\n", __func__); wait_for_completion(&dev_dax->cmp); - percpu_ref_exit(ref); + percpu_ref_exit(pgmap->ref); } -static void dev_dax_percpu_kill(struct percpu_ref *ref) +static void dev_dax_percpu_kill(struct dev_pagemap *pgmap) { - struct dev_dax *dev_dax = ref_to_dev_dax(ref); + struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); dev_dbg(&dev_dax->dev, "%s\n", __func__); - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c2449af2b388..9dac48359353 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = { NULL, }; -static void pmem_pagemap_cleanup(struct percpu_ref *ref) +static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap) { - struct request_queue *q; + struct request_queue *q = + container_of(pgmap->ref, struct request_queue, q_usage_counter); - q = container_of(ref, typeof(*q), q_usage_counter); blk_cleanup_queue(q); } -static void pmem_release_queue(void *ref) +static void pmem_release_queue(void *pgmap) { - pmem_pagemap_cleanup(ref); + pmem_pagemap_cleanup(pgmap); } -static void pmem_pagemap_kill(struct percpu_ref *ref) +static void pmem_pagemap_kill(struct dev_pagemap *pgmap) { - struct request_queue *q; + struct request_queue *q = + container_of(pgmap->ref, struct request_queue, q_usage_counter); - q = container_of(ref, typeof(*q), q_usage_counter); blk_freeze_queue_start(q); } @@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev, memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); } else { if (devm_add_action_or_reset(dev, pmem_release_queue, - &q->q_usage_counter)) + &pmem->pgmap)) return -ENOMEM; addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index fb039259d463..fa6249e4ed5f 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -91,14 +91,15 @@ static void pci_p2pdma_percpu_release(struct percpu_ref *ref) complete(&p2p_pgmap->ref_done); } -static void pci_p2pdma_percpu_kill(struct percpu_ref *ref) +static void pci_p2pdma_percpu_kill(struct dev_pagemap *pgmap) { - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } -static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref) +static void pci_p2pdma_percpu_cleanup(struct dev_pagemap *pgmap) { - struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref); + struct p2pdma_pagemap *p2p_pgmap = + container_of(pgmap, struct p2pdma_pagemap, pgmap); wait_for_completion(&p2p_pgmap->ref_done); percpu_ref_exit(&p2p_pgmap->ref); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 919755f48c7e..b8666a0d8665 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -74,12 +74,12 @@ struct dev_pagemap_ops { /* * Transition the refcount in struct dev_pagemap to the dead state. */ - void (*kill)(struct percpu_ref *ref); + void (*kill)(struct dev_pagemap *pgmap); /* * Wait for refcount in struct dev_pagemap to be idle and reap it. */ - void (*cleanup)(struct percpu_ref *ref); + void (*cleanup)(struct dev_pagemap *pgmap); }; /** diff --git a/kernel/memremap.c b/kernel/memremap.c index 0824237ef979..00c1ceb60c19 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap->ref); + pgmap->ops->kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -294,8 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap->ref); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->kill(pgmap); + pgmap->ops->cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/mm/hmm.c b/mm/hmm.c index 583a02a16872..987793fba923 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1352,18 +1352,18 @@ static void hmm_devmem_ref_release(struct percpu_ref *ref) complete(&devmem->completion); } -static void hmm_devmem_ref_exit(struct percpu_ref *ref) +static void hmm_devmem_ref_exit(struct dev_pagemap *pgmap) { struct hmm_devmem *devmem; - devmem = container_of(ref, struct hmm_devmem, ref); + devmem = container_of(pgmap, struct hmm_devmem, pagemap); wait_for_completion(&devmem->completion); - percpu_ref_exit(ref); + percpu_ref_exit(pgmap->ref); } -static void hmm_devmem_ref_kill(struct percpu_ref *ref) +static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) { - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index cf3f064a697d..82f901569e06 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -102,8 +102,8 @@ static void nfit_test_kill(void *_pgmap) WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup); - pgmap->ops->kill(pgmap->ref); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->kill(pgmap); + pgmap->ops->cleanup(pgmap); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3 From 24917f6b1041a73993178920656e13364f847995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:14 +0200 Subject: memremap: provide an optional internal refcount in struct dev_pagemap Provide an internal refcounting logic if no ->ref field is provided in the pagemap passed into devm_memremap_pages so that callers don't have to reinvent it poorly. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- include/linux/memremap.h | 4 +++ kernel/memremap.c | 64 +++++++++++++++++++++++++++++++-------- tools/testing/nvdimm/test/iomap.c | 58 +++++++++++++++++++++++++++-------- 3 files changed, 101 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e25685b878e9..f8a5b2a19945 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -95,6 +95,8 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping + * @internal_ref: internal reference if @ref is not provided by the caller + * @done: completion for @internal_ref * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h @@ -105,6 +107,8 @@ struct dev_pagemap { struct vmem_altmap altmap; struct resource res; struct percpu_ref *ref; + struct percpu_ref internal_ref; + struct completion done; struct device *dev; enum memory_type type; unsigned int flags; diff --git a/kernel/memremap.c b/kernel/memremap.c index eee490e7d7e1..bea6f887adad 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,7 +29,7 @@ static void devmap_managed_enable_put(void *data) static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) { - if (!pgmap->ops->page_free) { + if (!pgmap->ops || !pgmap->ops->page_free) { WARN(1, "Missing page_free method\n"); return -EINVAL; } @@ -75,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + +static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -84,10 +102,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap); + dev_pagemap_kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap); + dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -114,20 +132,29 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type and ops members of @pgmap must be - * initialized by the caller before passing it to this function + * 1/ At a minimum the res and type members of @pgmap must be initialized + * by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * - * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped - * at devm_memremap_pages_release() time, or if this routine fails. + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at + * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -175,10 +202,21 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || - !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } } if (need_devmap_managed) { @@ -296,8 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap); - pgmap->ops->cleanup(pgmap); + dev_pagemap_kill(pgmap); + dev_pagemap_cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 82f901569e06..cd040b5abffe 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -100,26 +100,60 @@ static void nfit_test_kill(void *_pgmap) { struct dev_pagemap *pgmap = _pgmap; - WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || - !pgmap->ops->cleanup); - pgmap->ops->kill(pgmap); - pgmap->ops->cleanup(pgmap); + WARN_ON(!pgmap || !pgmap->ref); + + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); + + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { + int error; resource_size_t offset = pgmap->res.start; struct nfit_test_resource *nfit_res = get_nfit_res(offset); - if (nfit_res) { - int rc; - - rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); - if (rc) - return ERR_PTR(rc); - return nfit_res->buf + offset - nfit_res->res.start; + if (!nfit_res) + return devm_memremap_pages(dev, pgmap); + + pgmap->dev = dev; + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } } - return devm_memremap_pages(dev, pgmap); + + error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); + if (error) + return ERR_PTR(error); + return nfit_res->buf + offset - nfit_res->res.start; } EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages); -- cgit v1.2.3 From 7b570361f6f66c91443541b19121038c076e7d64 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 17 Jun 2019 16:52:04 +0200 Subject: selftests/powerpc: Add missing newline at end of file "git diff" says: \ No newline at end of file after modifying the file. Signed-off-by: Geert Uytterhoeven [mpe: Rebase since addition of another test] Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/mm/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index d503b8764a8e..7101ffd08d66 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -4,4 +4,4 @@ tempfile prot_sao segv_errors wild_bctr -large_vm_fork_separation \ No newline at end of file +large_vm_fork_separation -- cgit v1.2.3 From 3fb4f7cd472c7f5905c91508e988f6b28372210d Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 30 Jun 2019 10:14:08 -0700 Subject: tools/power/x86: A tool to validate Intel Speed Select commands The Intel(R) Speed select technologies contains four features. Performance profile:An non architectural mechanism that allows multiple optimized performance profiles per system via static and/or dynamic adjustment of core count, workload, Tjmax, and TDP, etc. aka ISS in the documentation. Base Frequency: Enables users to increase guaranteed base frequency on certain cores (high priority cores) in exchange for lower base frequency on remaining cores (low priority cores). aka PBF in the documenation. Turbo frequency: Enables the ability to set different turbo ratio limits to cores based on priority. aka FACT in the documentation. Core power: An Interface that allows user to define per core/tile priority. There is a multi level help for commands and options. This can be used to check required arguments for each feature and commands for the feature. To start navigating the features start with $sudo intel-speed-select --help For help on a specific feature for example $sudo intel-speed-select perf-profile --help To get help for a command for a feature for example $sudo intel-speed-select perf-profile get-lock-status --help Signed-off-by: Srinivas Pandruvada Acked-by: Len Brown Acked-by: Rafael J. Wysocki Signed-off-by: Andy Shevchenko --- tools/Makefile | 12 +- tools/power/x86/intel-speed-select/Build | 1 + tools/power/x86/intel-speed-select/Makefile | 56 + tools/power/x86/intel-speed-select/isst-config.c | 1607 +++++++++++++++++++++ tools/power/x86/intel-speed-select/isst-core.c | 721 +++++++++ tools/power/x86/intel-speed-select/isst-display.c | 479 ++++++ tools/power/x86/intel-speed-select/isst.h | 231 +++ 7 files changed, 3102 insertions(+), 5 deletions(-) create mode 100644 tools/power/x86/intel-speed-select/Build create mode 100644 tools/power/x86/intel-speed-select/Makefile create mode 100644 tools/power/x86/intel-speed-select/isst-config.c create mode 100644 tools/power/x86/intel-speed-select/isst-core.c create mode 100644 tools/power/x86/intel-speed-select/isst-display.c create mode 100644 tools/power/x86/intel-speed-select/isst.h (limited to 'tools') diff --git a/tools/Makefile b/tools/Makefile index 3dfd72ae6c1a..68defd7ecf5d 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -19,6 +19,7 @@ help: @echo ' gpio - GPIO tools' @echo ' hv - tools used when in Hyper-V clients' @echo ' iio - IIO tools' + @echo ' intel-speed-select - Intel Speed Select tool' @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' leds - LEDs tools' @echo ' liblockdep - user-space wrapper for kernel locking-validator' @@ -82,7 +83,7 @@ perf: FORCE selftests: FORCE $(call descend,testing/$@) -turbostat x86_energy_perf_policy: FORCE +turbostat x86_energy_perf_policy intel-speed-select: FORCE $(call descend,power/x86/$@) tmon: FORCE @@ -115,7 +116,7 @@ liblockdep_install: selftests_install: $(call descend,testing/$(@:_install=),install) -turbostat_install x86_energy_perf_policy_install: +turbostat_install x86_energy_perf_policy_install intel-speed-select_install: $(call descend,power/x86/$(@:_install=),install) tmon_install: @@ -132,7 +133,7 @@ install: acpi_install cgroup_install cpupower_install gpio_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install vm_install bpf_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install \ - wmi_install pci_install debugging_install + wmi_install pci_install debugging_install intel-speed-select_install acpi_clean: $(call descend,power/acpi,clean) @@ -162,7 +163,7 @@ perf_clean: selftests_clean: $(call descend,testing/$(@:_clean=),clean) -turbostat_clean x86_energy_perf_policy_clean: +turbostat_clean x86_energy_perf_policy_clean intel-speed-select_clean: $(call descend,power/x86/$(@:_clean=),clean) tmon_clean: @@ -178,6 +179,7 @@ clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \ vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \ - gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean + gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ + intel-speed-select_clean .PHONY: FORCE diff --git a/tools/power/x86/intel-speed-select/Build b/tools/power/x86/intel-speed-select/Build new file mode 100644 index 000000000000..b61456d75190 --- /dev/null +++ b/tools/power/x86/intel-speed-select/Build @@ -0,0 +1 @@ +intel-speed-select-y += isst-config.o isst-core.o isst-display.o diff --git a/tools/power/x86/intel-speed-select/Makefile b/tools/power/x86/intel-speed-select/Makefile new file mode 100644 index 000000000000..12c6939dca2a --- /dev/null +++ b/tools/power/x86/intel-speed-select/Makefile @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../../../scripts/Makefile.include + +bindir ?= /usr/bin + +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := intel-speed-select +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) + +all: $(ALL_PROGRAMS) + +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +# +# We need the following to be outside of kernel tree +# +$(OUTPUT)include/linux/isst_if.h: ../../../../include/uapi/linux/isst_if.h + mkdir -p $(OUTPUT)include/linux 2>&1 || true + ln -sf $(CURDIR)/../../../../include/uapi/linux/isst_if.h $@ + +prepare: $(OUTPUT)include/linux/isst_if.h + +ISST_IN := $(OUTPUT)intel-speed-select-in.o + +$(ISST_IN): prepare FORCE + $(Q)$(MAKE) $(build)=intel-speed-select +$(OUTPUT)intel-speed-select: $(ISST_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ + +clean: + rm -f $(ALL_PROGRAMS) + rm -rf $(OUTPUT)include/linux/isst_if.h + find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + +install: $(ALL_PROGRAMS) + install -d -m 755 $(DESTDIR)$(bindir); \ + for program in $(ALL_PROGRAMS); do \ + install $$program $(DESTDIR)$(bindir); \ + done + +FORCE: + +.PHONY: all install clean FORCE prepare diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c new file mode 100644 index 000000000000..91c5ad1685a1 --- /dev/null +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -0,0 +1,1607 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Speed Select -- Enumerate and control features + * Copyright (c) 2019 Intel Corporation. + */ + +#include + +#include "isst.h" + +struct process_cmd_struct { + char *feature; + char *command; + void (*process_fn)(void); +}; + +static const char *version_str = "v1.0"; +static const int supported_api_ver = 1; +static struct isst_if_platform_info isst_platform_info; +static char *progname; +static int debug_flag; +static FILE *outf; + +static int cpu_model; + +#define MAX_CPUS_IN_ONE_REQ 64 +static short max_target_cpus; +static unsigned short target_cpus[MAX_CPUS_IN_ONE_REQ]; + +static int topo_max_cpus; +static size_t present_cpumask_size; +static cpu_set_t *present_cpumask; +static size_t target_cpumask_size; +static cpu_set_t *target_cpumask; +static int tdp_level = 0xFF; +static int fact_bucket = 0xFF; +static int fact_avx = 0xFF; +static unsigned long long fact_trl; +static int out_format_json; +static int cmd_help; + +/* clos related */ +static int current_clos = -1; +static int clos_epp = -1; +static int clos_prop_prio = -1; +static int clos_min = -1; +static int clos_max = -1; +static int clos_desired = -1; +static int clos_priority_type; + +struct _cpu_map { + unsigned short core_id; + unsigned short pkg_id; + unsigned short die_id; + unsigned short punit_cpu; + unsigned short punit_cpu_core; +}; +struct _cpu_map *cpu_map; + +void debug_printf(const char *format, ...) +{ + va_list args; + + va_start(args, format); + + if (debug_flag) + vprintf(format, args); + + va_end(args); +} + +static void update_cpu_model(void) +{ + unsigned int ebx, ecx, edx; + unsigned int fms, family; + + __cpuid(1, fms, ebx, ecx, edx); + family = (fms >> 8) & 0xf; + cpu_model = (fms >> 4) & 0xf; + if (family == 6 || family == 0xf) + cpu_model += ((fms >> 16) & 0xf) << 4; +} + +/* Open a file, and exit on failure */ +static FILE *fopen_or_exit(const char *path, const char *mode) +{ + FILE *filep = fopen(path, mode); + + if (!filep) + err(1, "%s: open failed", path); + + return filep; +} + +/* Parse a file containing a single int */ +static int parse_int_file(int fatal, const char *fmt, ...) +{ + va_list args; + char path[PATH_MAX]; + FILE *filep; + int value; + + va_start(args, fmt); + vsnprintf(path, sizeof(path), fmt, args); + va_end(args); + if (fatal) { + filep = fopen_or_exit(path, "r"); + } else { + filep = fopen(path, "r"); + if (!filep) + return -1; + } + if (fscanf(filep, "%d", &value) != 1) + err(1, "%s: failed to parse number from file", path); + fclose(filep); + + return value; +} + +int cpufreq_sysfs_present(void) +{ + DIR *dir; + + dir = opendir("/sys/devices/system/cpu/cpu0/cpufreq"); + if (dir) { + closedir(dir); + return 1; + } + + return 0; +} + +int out_format_is_json(void) +{ + return out_format_json; +} + +int get_physical_package_id(int cpu) +{ + return parse_int_file( + 1, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", + cpu); +} + +int get_physical_core_id(int cpu) +{ + return parse_int_file( + 1, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); +} + +int get_physical_die_id(int cpu) +{ + int ret; + + ret = parse_int_file(0, "/sys/devices/system/cpu/cpu%d/topology/die_id", + cpu); + if (ret < 0) + ret = 0; + + return ret; +} + +int get_topo_max_cpus(void) +{ + return topo_max_cpus; +} + +#define MAX_PACKAGE_COUNT 8 +#define MAX_DIE_PER_PACKAGE 2 +static void for_each_online_package_in_set(void (*callback)(int, void *, void *, + void *, void *), + void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int max_packages[MAX_PACKAGE_COUNT * MAX_PACKAGE_COUNT]; + int pkg_index = 0, i; + + memset(max_packages, 0xff, sizeof(max_packages)); + for (i = 0; i < topo_max_cpus; ++i) { + int j, online, pkg_id, die_id = 0, skip = 0; + + if (!CPU_ISSET_S(i, present_cpumask_size, present_cpumask)) + continue; + if (i) + online = parse_int_file( + 1, "/sys/devices/system/cpu/cpu%d/online", i); + else + online = + 1; /* online entry for CPU 0 needs some special configs */ + + die_id = get_physical_die_id(i); + if (die_id < 0) + die_id = 0; + pkg_id = get_physical_package_id(i); + /* Create an unique id for package, die combination to store */ + pkg_id = (MAX_PACKAGE_COUNT * pkg_id + die_id); + + for (j = 0; j < pkg_index; ++j) { + if (max_packages[j] == pkg_id) { + skip = 1; + break; + } + } + + if (!skip && online && callback) { + callback(i, arg1, arg2, arg3, arg4); + max_packages[pkg_index++] = pkg_id; + } + } +} + +static void for_each_online_target_cpu_in_set( + void (*callback)(int, void *, void *, void *, void *), void *arg1, + void *arg2, void *arg3, void *arg4) +{ + int i; + + for (i = 0; i < topo_max_cpus; ++i) { + int online; + + if (!CPU_ISSET_S(i, target_cpumask_size, target_cpumask)) + continue; + if (i) + online = parse_int_file( + 1, "/sys/devices/system/cpu/cpu%d/online", i); + else + online = + 1; /* online entry for CPU 0 needs some special configs */ + + if (online && callback) + callback(i, arg1, arg2, arg3, arg4); + } +} + +#define BITMASK_SIZE 32 +static void set_max_cpu_num(void) +{ + FILE *filep; + unsigned long dummy; + + topo_max_cpus = 0; + filep = fopen_or_exit( + "/sys/devices/system/cpu/cpu0/topology/thread_siblings", "r"); + while (fscanf(filep, "%lx,", &dummy) == 1) + topo_max_cpus += BITMASK_SIZE; + fclose(filep); + topo_max_cpus--; /* 0 based */ + + debug_printf("max cpus %d\n", topo_max_cpus); +} + +size_t alloc_cpu_set(cpu_set_t **cpu_set) +{ + cpu_set_t *_cpu_set; + size_t size; + + _cpu_set = CPU_ALLOC((topo_max_cpus + 1)); + if (_cpu_set == NULL) + err(3, "CPU_ALLOC"); + size = CPU_ALLOC_SIZE((topo_max_cpus + 1)); + CPU_ZERO_S(size, _cpu_set); + + *cpu_set = _cpu_set; + return size; +} + +void free_cpu_set(cpu_set_t *cpu_set) +{ + CPU_FREE(cpu_set); +} + +static int cpu_cnt[MAX_PACKAGE_COUNT][MAX_DIE_PER_PACKAGE]; +static void set_cpu_present_cpu_mask(void) +{ + size_t size; + DIR *dir; + int i; + + size = alloc_cpu_set(&present_cpumask); + present_cpumask_size = size; + for (i = 0; i < topo_max_cpus; ++i) { + char buffer[256]; + + snprintf(buffer, sizeof(buffer), + "/sys/devices/system/cpu/cpu%d", i); + dir = opendir(buffer); + if (dir) { + int pkg_id, die_id; + + CPU_SET_S(i, size, present_cpumask); + die_id = get_physical_die_id(i); + if (die_id < 0) + die_id = 0; + + pkg_id = get_physical_package_id(i); + if (pkg_id < MAX_PACKAGE_COUNT && + die_id < MAX_DIE_PER_PACKAGE) + cpu_cnt[pkg_id][die_id]++; + } + closedir(dir); + } +} + +int get_cpu_count(int pkg_id, int die_id) +{ + if (pkg_id < MAX_PACKAGE_COUNT && die_id < MAX_DIE_PER_PACKAGE) + return cpu_cnt[pkg_id][die_id] + 1; + + return 0; +} + +static void set_cpu_target_cpu_mask(void) +{ + size_t size; + int i; + + size = alloc_cpu_set(&target_cpumask); + target_cpumask_size = size; + for (i = 0; i < max_target_cpus; ++i) { + if (!CPU_ISSET_S(target_cpus[i], present_cpumask_size, + present_cpumask)) + continue; + + CPU_SET_S(target_cpus[i], size, target_cpumask); + } +} + +static void create_cpu_map(void) +{ + const char *pathname = "/dev/isst_interface"; + int i, fd = 0; + struct isst_if_cpu_maps map; + + cpu_map = malloc(sizeof(*cpu_map) * topo_max_cpus); + if (!cpu_map) + err(3, "cpumap"); + + fd = open(pathname, O_RDWR); + if (fd < 0) + err(-1, "%s open failed", pathname); + + for (i = 0; i < topo_max_cpus; ++i) { + if (!CPU_ISSET_S(i, present_cpumask_size, present_cpumask)) + continue; + + map.cmd_count = 1; + map.cpu_map[0].logical_cpu = i; + + debug_printf(" map logical_cpu:%d\n", + map.cpu_map[0].logical_cpu); + if (ioctl(fd, ISST_IF_GET_PHY_ID, &map) == -1) { + perror("ISST_IF_GET_PHY_ID"); + fprintf(outf, "Error: map logical_cpu:%d\n", + map.cpu_map[0].logical_cpu); + continue; + } + cpu_map[i].core_id = get_physical_core_id(i); + cpu_map[i].pkg_id = get_physical_package_id(i); + cpu_map[i].die_id = get_physical_die_id(i); + cpu_map[i].punit_cpu = map.cpu_map[0].physical_cpu; + cpu_map[i].punit_cpu_core = (map.cpu_map[0].physical_cpu >> + 1); // shift to get core id + + debug_printf( + "map logical_cpu:%d core: %d die:%d pkg:%d punit_cpu:%d punit_core:%d\n", + i, cpu_map[i].core_id, cpu_map[i].die_id, + cpu_map[i].pkg_id, cpu_map[i].punit_cpu, + cpu_map[i].punit_cpu_core); + } + + if (fd) + close(fd); +} + +int find_logical_cpu(int pkg_id, int die_id, int punit_core_id) +{ + int i; + + for (i = 0; i < topo_max_cpus; ++i) { + if (cpu_map[i].pkg_id == pkg_id && + cpu_map[i].die_id == die_id && + cpu_map[i].punit_cpu_core == punit_core_id) + return i; + } + + return -EINVAL; +} + +void set_cpu_mask_from_punit_coremask(int cpu, unsigned long long core_mask, + size_t core_cpumask_size, + cpu_set_t *core_cpumask, int *cpu_cnt) +{ + int i, cnt = 0; + int die_id, pkg_id; + + *cpu_cnt = 0; + die_id = get_physical_die_id(cpu); + pkg_id = get_physical_package_id(cpu); + + for (i = 0; i < 64; ++i) { + if (core_mask & BIT(i)) { + int j; + + for (j = 0; j < topo_max_cpus; ++j) { + if (cpu_map[j].pkg_id == pkg_id && + cpu_map[j].die_id == die_id && + cpu_map[j].punit_cpu_core == i) { + CPU_SET_S(j, core_cpumask_size, + core_cpumask); + ++cnt; + } + } + } + } + + *cpu_cnt = cnt; +} + +int find_phy_core_num(int logical_cpu) +{ + if (logical_cpu < topo_max_cpus) + return cpu_map[logical_cpu].punit_cpu_core; + + return -EINVAL; +} + +static int isst_send_mmio_command(unsigned int cpu, unsigned int reg, int write, + unsigned int *value) +{ + struct isst_if_io_regs io_regs; + const char *pathname = "/dev/isst_interface"; + int cmd; + int fd; + + debug_printf("mmio_cmd cpu:%d reg:%d write:%d\n", cpu, reg, write); + + fd = open(pathname, O_RDWR); + if (fd < 0) + err(-1, "%s open failed", pathname); + + io_regs.req_count = 1; + io_regs.io_reg[0].logical_cpu = cpu; + io_regs.io_reg[0].reg = reg; + cmd = ISST_IF_IO_CMD; + if (write) { + io_regs.io_reg[0].read_write = 1; + io_regs.io_reg[0].value = *value; + } else { + io_regs.io_reg[0].read_write = 0; + } + + if (ioctl(fd, cmd, &io_regs) == -1) { + perror("ISST_IF_IO_CMD"); + fprintf(outf, "Error: mmio_cmd cpu:%d reg:%x read_write:%x\n", + cpu, reg, write); + } else { + if (!write) + *value = io_regs.io_reg[0].value; + + debug_printf( + "mmio_cmd response: cpu:%d reg:%x rd_write:%x resp:%x\n", + cpu, reg, write, *value); + } + + close(fd); + + return 0; +} + +int isst_send_mbox_command(unsigned int cpu, unsigned char command, + unsigned char sub_command, unsigned int parameter, + unsigned int req_data, unsigned int *resp) +{ + const char *pathname = "/dev/isst_interface"; + int fd; + struct isst_if_mbox_cmds mbox_cmds = { 0 }; + + debug_printf( + "mbox_send: cpu:%d command:%x sub_command:%x parameter:%x req_data:%x\n", + cpu, command, sub_command, parameter, req_data); + + if (isst_platform_info.mmio_supported && command == CONFIG_CLOS) { + unsigned int value; + int write = 0; + int clos_id, core_id, ret = 0; + + debug_printf("CLOS %d\n", cpu); + + if (parameter & BIT(MBOX_CMD_WRITE_BIT)) { + value = req_data; + write = 1; + } + + switch (sub_command) { + case CLOS_PQR_ASSOC: + core_id = parameter & 0xff; + ret = isst_send_mmio_command( + cpu, PQR_ASSOC_OFFSET + core_id * 4, write, + &value); + if (!ret && !write) + *resp = value; + break; + case CLOS_PM_CLOS: + clos_id = parameter & 0x03; + ret = isst_send_mmio_command( + cpu, PM_CLOS_OFFSET + clos_id * 4, write, + &value); + if (!ret && !write) + *resp = value; + break; + case CLOS_PM_QOS_CONFIG: + ret = isst_send_mmio_command(cpu, PM_QOS_CONFIG_OFFSET, + write, &value); + if (!ret && !write) + *resp = value; + break; + case CLOS_STATUS: + break; + default: + break; + } + return ret; + } + + mbox_cmds.cmd_count = 1; + mbox_cmds.mbox_cmd[0].logical_cpu = cpu; + mbox_cmds.mbox_cmd[0].command = command; + mbox_cmds.mbox_cmd[0].sub_command = sub_command; + mbox_cmds.mbox_cmd[0].parameter = parameter; + mbox_cmds.mbox_cmd[0].req_data = req_data; + + fd = open(pathname, O_RDWR); + if (fd < 0) + err(-1, "%s open failed", pathname); + + if (ioctl(fd, ISST_IF_MBOX_COMMAND, &mbox_cmds) == -1) { + perror("ISST_IF_MBOX_COMMAND"); + fprintf(outf, + "Error: mbox_cmd cpu:%d command:%x sub_command:%x parameter:%x req_data:%x\n", + cpu, command, sub_command, parameter, req_data); + } else { + *resp = mbox_cmds.mbox_cmd[0].resp_data; + debug_printf( + "mbox_cmd response: cpu:%d command:%x sub_command:%x parameter:%x req_data:%x resp:%x\n", + cpu, command, sub_command, parameter, req_data, *resp); + } + + close(fd); + + return 0; +} + +int isst_send_msr_command(unsigned int cpu, unsigned int msr, int write, + unsigned long long *req_resp) +{ + struct isst_if_msr_cmds msr_cmds; + const char *pathname = "/dev/isst_interface"; + int fd; + + fd = open(pathname, O_RDWR); + if (fd < 0) + err(-1, "%s open failed", pathname); + + msr_cmds.cmd_count = 1; + msr_cmds.msr_cmd[0].logical_cpu = cpu; + msr_cmds.msr_cmd[0].msr = msr; + msr_cmds.msr_cmd[0].read_write = write; + if (write) + msr_cmds.msr_cmd[0].data = *req_resp; + + if (ioctl(fd, ISST_IF_MSR_COMMAND, &msr_cmds) == -1) { + perror("ISST_IF_MSR_COMMAD"); + fprintf(outf, "Error: msr_cmd cpu:%d msr:%x read_write:%d\n", + cpu, msr, write); + } else { + if (!write) + *req_resp = msr_cmds.msr_cmd[0].data; + + debug_printf( + "msr_cmd response: cpu:%d msr:%x rd_write:%x resp:%llx %llx\n", + cpu, msr, write, *req_resp, msr_cmds.msr_cmd[0].data); + } + + close(fd); + + return 0; +} + +static int isst_fill_platform_info(void) +{ + const char *pathname = "/dev/isst_interface"; + int fd; + + fd = open(pathname, O_RDWR); + if (fd < 0) + err(-1, "%s open failed", pathname); + + if (ioctl(fd, ISST_IF_GET_PLATFORM_INFO, &isst_platform_info) == -1) { + perror("ISST_IF_GET_PLATFORM_INFO"); + close(fd); + return -1; + } + + close(fd); + + return 0; +} + +static void isst_print_platform_information(void) +{ + struct isst_if_platform_info platform_info; + const char *pathname = "/dev/isst_interface"; + int fd; + + fd = open(pathname, O_RDWR); + if (fd < 0) + err(-1, "%s open failed", pathname); + + if (ioctl(fd, ISST_IF_GET_PLATFORM_INFO, &platform_info) == -1) { + perror("ISST_IF_GET_PLATFORM_INFO"); + } else { + fprintf(outf, "Platform: API version : %d\n", + platform_info.api_version); + fprintf(outf, "Platform: Driver version : %d\n", + platform_info.driver_version); + fprintf(outf, "Platform: mbox supported : %d\n", + platform_info.mbox_supported); + fprintf(outf, "Platform: mmio supported : %d\n", + platform_info.mmio_supported); + } + + close(fd); + + exit(0); +} + +static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int (*fn_ptr)(int cpu, void *arg); + int ret; + + fn_ptr = arg1; + ret = fn_ptr(cpu, arg2); + if (ret) + perror("get_tdp_*"); + else + isst_display_result(cpu, outf, "perf-profile", (char *)arg3, + *(unsigned int *)arg4); +} + +#define _get_tdp_level(desc, suffix, object, help) \ + static void get_tdp_##object(void) \ + { \ + struct isst_pkg_ctdp ctdp; \ +\ + if (cmd_help) { \ + fprintf(stderr, \ + "Print %s [No command arguments are required]\n", \ + help); \ + exit(0); \ + } \ + isst_ctdp_display_information_start(outf); \ + if (max_target_cpus) \ + for_each_online_target_cpu_in_set( \ + exec_on_get_ctdp_cpu, isst_get_ctdp_##suffix, \ + &ctdp, desc, &ctdp.object); \ + else \ + for_each_online_package_in_set(exec_on_get_ctdp_cpu, \ + isst_get_ctdp_##suffix, \ + &ctdp, desc, \ + &ctdp.object); \ + isst_ctdp_display_information_end(outf); \ + } + +_get_tdp_level("get-config-levels", levels, levels, "TDP levels"); +_get_tdp_level("get-config-version", levels, version, "TDP version"); +_get_tdp_level("get-config-enabled", levels, enabled, "TDP enable status"); +_get_tdp_level("get-config-current_level", levels, current_level, + "Current TDP Level"); +_get_tdp_level("get-lock-status", levels, locked, "TDP lock status"); + +static void dump_isst_config_for_cpu(int cpu, void *arg1, void *arg2, + void *arg3, void *arg4) +{ + struct isst_pkg_ctdp pkg_dev; + int ret; + + memset(&pkg_dev, 0, sizeof(pkg_dev)); + ret = isst_get_process_ctdp(cpu, tdp_level, &pkg_dev); + if (ret) { + perror("isst_get_process_ctdp"); + } else { + isst_ctdp_display_information(cpu, outf, tdp_level, &pkg_dev); + isst_get_process_ctdp_complete(cpu, &pkg_dev); + } +} + +static void dump_isst_config(void) +{ + if (cmd_help) { + fprintf(stderr, + "Print Intel(R) Speed Select Technology Performance profile configuration\n"); + fprintf(stderr, + "including base frequency and turbo frequency configurations\n"); + fprintf(stderr, "Optional: -l|--level : Specify tdp level\n"); + fprintf(stderr, + "\tIf no arguments, dump information for all TDP levels\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + + if (max_target_cpus) + for_each_online_target_cpu_in_set(dump_isst_config_for_cpu, + NULL, NULL, NULL, NULL); + else + for_each_online_package_in_set(dump_isst_config_for_cpu, NULL, + NULL, NULL, NULL); + + isst_ctdp_display_information_end(outf); +} + +static void set_tdp_level_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int ret; + + ret = isst_set_tdp_level(cpu, tdp_level); + if (ret) + perror("set_tdp_level_for_cpu"); + else + isst_display_result(cpu, outf, "perf-profile", "set_tdp_level", + ret); +} + +static void set_tdp_level(void) +{ + if (cmd_help) { + fprintf(stderr, "Set Config TDP level\n"); + fprintf(stderr, + "\t Arguments: -l|--level : Specify tdp level\n"); + exit(0); + } + + if (tdp_level == 0xff) { + fprintf(outf, "Invalid command: specify tdp_level\n"); + exit(1); + } + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(set_tdp_level_for_cpu, NULL, + NULL, NULL, NULL); + else + for_each_online_package_in_set(set_tdp_level_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); +} + +static void dump_pbf_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + struct isst_pbf_info pbf_info; + int ret; + + ret = isst_get_pbf_info(cpu, tdp_level, &pbf_info); + if (ret) { + perror("isst_get_pbf_info"); + } else { + isst_pbf_display_information(cpu, outf, tdp_level, &pbf_info); + isst_get_pbf_info_complete(&pbf_info); + } +} + +static void dump_pbf_config(void) +{ + if (cmd_help) { + fprintf(stderr, + "Print Intel(R) Speed Select Technology base frequency configuration for a TDP level\n"); + fprintf(stderr, + "\tArguments: -l|--level : Specify tdp level\n"); + exit(0); + } + + if (tdp_level == 0xff) { + fprintf(outf, "Invalid command: specify tdp_level\n"); + exit(1); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(dump_pbf_config_for_cpu, NULL, + NULL, NULL, NULL); + else + for_each_online_package_in_set(dump_pbf_config_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); +} + +static void set_pbf_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int ret; + int status = *(int *)arg4; + + ret = isst_set_pbf_fact_status(cpu, 1, status); + if (ret) { + perror("isst_set_pbf"); + } else { + if (status) + isst_display_result(cpu, outf, "base-freq", "enable", + ret); + else + isst_display_result(cpu, outf, "base-freq", "disable", + ret); + } +} + +static void set_pbf_enable(void) +{ + int status = 1; + + if (cmd_help) { + fprintf(stderr, + "Enable Intel Speed Select Technology base frequency feature [No command arguments are required]\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(set_pbf_for_cpu, NULL, NULL, + NULL, &status); + else + for_each_online_package_in_set(set_pbf_for_cpu, NULL, NULL, + NULL, &status); + isst_ctdp_display_information_end(outf); +} + +static void set_pbf_disable(void) +{ + int status = 0; + + if (cmd_help) { + fprintf(stderr, + "Disable Intel Speed Select Technology base frequency feature [No command arguments are required]\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(set_pbf_for_cpu, NULL, NULL, + NULL, &status); + else + for_each_online_package_in_set(set_pbf_for_cpu, NULL, NULL, + NULL, &status); + isst_ctdp_display_information_end(outf); +} + +static void dump_fact_config_for_cpu(int cpu, void *arg1, void *arg2, + void *arg3, void *arg4) +{ + struct isst_fact_info fact_info; + int ret; + + ret = isst_get_fact_info(cpu, tdp_level, &fact_info); + if (ret) + perror("isst_get_fact_bucket_info"); + else + isst_fact_display_information(cpu, outf, tdp_level, fact_bucket, + fact_avx, &fact_info); +} + +static void dump_fact_config(void) +{ + if (cmd_help) { + fprintf(stderr, + "Print complete Intel Speed Select Technology turbo frequency configuration for a TDP level. Other arguments are optional.\n"); + fprintf(stderr, + "\tArguments: -l|--level : Specify tdp level\n"); + fprintf(stderr, + "\tArguments: -b|--bucket : Bucket index to dump\n"); + fprintf(stderr, + "\tArguments: -r|--trl-type : Specify trl type: sse|avx2|avx512\n"); + exit(0); + } + + if (tdp_level == 0xff) { + fprintf(outf, "Invalid command: specify tdp_level\n"); + exit(1); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(dump_fact_config_for_cpu, + NULL, NULL, NULL, NULL); + else + for_each_online_package_in_set(dump_fact_config_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); +} + +static void set_fact_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int ret; + int status = *(int *)arg4; + + ret = isst_set_pbf_fact_status(cpu, 0, status); + if (ret) + perror("isst_set_fact"); + else { + if (status) { + struct isst_pkg_ctdp pkg_dev; + + ret = isst_get_ctdp_levels(cpu, &pkg_dev); + if (ret) { + isst_display_result(cpu, outf, "turbo-freq", + "enable", ret); + return; + } + ret = isst_set_trl(cpu, fact_trl); + isst_display_result(cpu, outf, "turbo-freq", "enable", + ret); + } else { + /* Since we modified TRL during Fact enable, restore it */ + isst_set_trl_from_current_tdp(cpu, fact_trl); + isst_display_result(cpu, outf, "turbo-freq", "disable", + ret); + } + } +} + +static void set_fact_enable(void) +{ + int status = 1; + + if (cmd_help) { + fprintf(stderr, + "Enable Intel Speed Select Technology Turbo frequency feature\n"); + fprintf(stderr, + "Optional: -t|--trl : Specify turbo ratio limit\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(set_fact_for_cpu, NULL, NULL, + NULL, &status); + else + for_each_online_package_in_set(set_fact_for_cpu, NULL, NULL, + NULL, &status); + isst_ctdp_display_information_end(outf); +} + +static void set_fact_disable(void) +{ + int status = 0; + + if (cmd_help) { + fprintf(stderr, + "Disable Intel Speed Select Technology turbo frequency feature\n"); + fprintf(stderr, + "Optional: -t|--trl : Specify turbo ratio limit\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(set_fact_for_cpu, NULL, NULL, + NULL, &status); + else + for_each_online_package_in_set(set_fact_for_cpu, NULL, NULL, + NULL, &status); + isst_ctdp_display_information_end(outf); +} + +static void enable_clos_qos_config(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int ret; + int status = *(int *)arg4; + + ret = isst_pm_qos_config(cpu, status, clos_priority_type); + if (ret) { + perror("isst_pm_qos_config"); + } else { + if (status) + isst_display_result(cpu, outf, "core-power", "enable", + ret); + else + isst_display_result(cpu, outf, "core-power", "disable", + ret); + } +} + +static void set_clos_enable(void) +{ + int status = 1; + + if (cmd_help) { + fprintf(stderr, "Enable core-power for a package/die\n"); + fprintf(stderr, + "\tClos Enable: Specify priority type with [--priority|-p]\n"); + fprintf(stderr, "\t\t 0: Proportional, 1: Ordered\n"); + exit(0); + } + + if (cpufreq_sysfs_present()) { + fprintf(stderr, + "cpufreq subsystem and core-power enable will interfere with each other!\n"); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(enable_clos_qos_config, NULL, + NULL, NULL, &status); + else + for_each_online_package_in_set(enable_clos_qos_config, NULL, + NULL, NULL, &status); + isst_ctdp_display_information_end(outf); +} + +static void set_clos_disable(void) +{ + int status = 0; + + if (cmd_help) { + fprintf(stderr, + "Disable core-power: [No command arguments are required]\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(enable_clos_qos_config, NULL, + NULL, NULL, &status); + else + for_each_online_package_in_set(enable_clos_qos_config, NULL, + NULL, NULL, &status); + isst_ctdp_display_information_end(outf); +} + +static void dump_clos_config_for_cpu(int cpu, void *arg1, void *arg2, + void *arg3, void *arg4) +{ + struct isst_clos_config clos_config; + int ret; + + ret = isst_pm_get_clos(cpu, current_clos, &clos_config); + if (ret) + perror("isst_pm_get_clos"); + else + isst_clos_display_information(cpu, outf, current_clos, + &clos_config); +} + +static void dump_clos_config(void) +{ + if (cmd_help) { + fprintf(stderr, + "Print Intel Speed Select Technology core power configuration\n"); + fprintf(stderr, + "\tArguments: [-c | --clos]: Specify clos id\n"); + exit(0); + } + if (current_clos < 0 || current_clos > 3) { + fprintf(stderr, "Invalid clos id\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(dump_clos_config_for_cpu, + NULL, NULL, NULL, NULL); + else + for_each_online_package_in_set(dump_clos_config_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); +} + +static void set_clos_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + struct isst_clos_config clos_config; + int ret; + + clos_config.pkg_id = get_physical_package_id(cpu); + clos_config.die_id = get_physical_die_id(cpu); + + clos_config.epp = clos_epp; + clos_config.clos_prop_prio = clos_prop_prio; + clos_config.clos_min = clos_min; + clos_config.clos_max = clos_max; + clos_config.clos_desired = clos_desired; + ret = isst_set_clos(cpu, current_clos, &clos_config); + if (ret) + perror("isst_set_clos"); + else + isst_display_result(cpu, outf, "core-power", "config", ret); +} + +static void set_clos_config(void) +{ + if (cmd_help) { + fprintf(stderr, + "Set core-power configuration for one of the four clos ids\n"); + fprintf(stderr, + "\tSpecify targeted clos id with [--clos|-c]\n"); + fprintf(stderr, "\tSpecify clos EPP with [--epp|-e]\n"); + fprintf(stderr, + "\tSpecify clos Proportional Priority [--weight|-w]\n"); + fprintf(stderr, "\tSpecify clos min with [--min|-n]\n"); + fprintf(stderr, "\tSpecify clos max with [--max|-m]\n"); + fprintf(stderr, "\tSpecify clos desired with [--desired|-d]\n"); + exit(0); + } + + if (current_clos < 0 || current_clos > 3) { + fprintf(stderr, "Invalid clos id\n"); + exit(0); + } + if (clos_epp < 0 || clos_epp > 0x0F) { + fprintf(stderr, "clos epp is not specified, default: 0\n"); + clos_epp = 0; + } + if (clos_prop_prio < 0 || clos_prop_prio > 0x0F) { + fprintf(stderr, + "clos frequency weight is not specified, default: 0\n"); + clos_prop_prio = 0; + } + if (clos_min < 0) { + fprintf(stderr, "clos min is not specified, default: 0\n"); + clos_min = 0; + } + if (clos_max < 0) { + fprintf(stderr, "clos max is not specified, default: 0xff\n"); + clos_max = 0xff; + } + if (clos_desired < 0) { + fprintf(stderr, "clos desired is not specified, default: 0\n"); + clos_desired = 0x00; + } + + isst_ctdp_display_information_start(outf); + if (max_target_cpus) + for_each_online_target_cpu_in_set(set_clos_config_for_cpu, NULL, + NULL, NULL, NULL); + else + for_each_online_package_in_set(set_clos_config_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); +} + +static void set_clos_assoc_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int ret; + + ret = isst_clos_associate(cpu, current_clos); + if (ret) + perror("isst_clos_associate"); + else + isst_display_result(cpu, outf, "core-power", "assoc", ret); +} + +static void set_clos_assoc(void) +{ + if (cmd_help) { + fprintf(stderr, "Associate a clos id to a CPU\n"); + fprintf(stderr, + "\tSpecify targeted clos id with [--clos|-c]\n"); + exit(0); + } + + if (current_clos < 0 || current_clos > 3) { + fprintf(stderr, "Invalid clos id\n"); + exit(0); + } + if (max_target_cpus) + for_each_online_target_cpu_in_set(set_clos_assoc_for_cpu, NULL, + NULL, NULL, NULL); + else { + fprintf(stderr, + "Invalid target cpu. Specify with [-c|--cpu]\n"); + } +} + +static void get_clos_assoc_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int clos, ret; + + ret = isst_clos_get_assoc_status(cpu, &clos); + if (ret) + perror("isst_clos_get_assoc_status"); + else + isst_display_result(cpu, outf, "core-power", "get-assoc", clos); +} + +static void get_clos_assoc(void) +{ + if (cmd_help) { + fprintf(stderr, "Get associate clos id to a CPU\n"); + fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); + exit(0); + } + if (max_target_cpus) + for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL, + NULL, NULL, NULL); + else { + fprintf(stderr, + "Invalid target cpu. Specify with [-c|--cpu]\n"); + } +} + +static struct process_cmd_struct isst_cmds[] = { + { "perf-profile", "get-lock-status", get_tdp_locked }, + { "perf-profile", "get-config-levels", get_tdp_levels }, + { "perf-profile", "get-config-version", get_tdp_version }, + { "perf-profile", "get-config-enabled", get_tdp_enabled }, + { "perf-profile", "get-config-current-level", get_tdp_current_level }, + { "perf-profile", "set-config-level", set_tdp_level }, + { "perf-profile", "info", dump_isst_config }, + { "base-freq", "info", dump_pbf_config }, + { "base-freq", "enable", set_pbf_enable }, + { "base-freq", "disable", set_pbf_disable }, + { "turbo-freq", "info", dump_fact_config }, + { "turbo-freq", "enable", set_fact_enable }, + { "turbo-freq", "disable", set_fact_disable }, + { "core-power", "info", dump_clos_config }, + { "core-power", "enable", set_clos_enable }, + { "core-power", "disable", set_clos_disable }, + { "core-power", "config", set_clos_config }, + { "core-power", "assoc", set_clos_assoc }, + { "core-power", "get-assoc", get_clos_assoc }, + { NULL, NULL, NULL } +}; + +/* + * parse cpuset with following syntax + * 1,2,4..6,8-10 and set bits in cpu_subset + */ +void parse_cpu_command(char *optarg) +{ + unsigned int start, end; + char *next; + + next = optarg; + + while (next && *next) { + if (*next == '-') /* no negative cpu numbers */ + goto error; + + start = strtoul(next, &next, 10); + + if (max_target_cpus < MAX_CPUS_IN_ONE_REQ) + target_cpus[max_target_cpus++] = start; + + if (*next == '\0') + break; + + if (*next == ',') { + next += 1; + continue; + } + + if (*next == '-') { + next += 1; /* start range */ + } else if (*next == '.') { + next += 1; + if (*next == '.') + next += 1; /* start range */ + else + goto error; + } + + end = strtoul(next, &next, 10); + if (end <= start) + goto error; + + while (++start <= end) { + if (max_target_cpus < MAX_CPUS_IN_ONE_REQ) + target_cpus[max_target_cpus++] = start; + } + + if (*next == ',') + next += 1; + else if (*next != '\0') + goto error; + } + +#ifdef DEBUG + { + int i; + + for (i = 0; i < max_target_cpus; ++i) + printf("cpu [%d] in arg\n", target_cpus[i]); + } +#endif + return; + +error: + fprintf(stderr, "\"--cpu %s\" malformed\n", optarg); + exit(-1); +} + +static void parse_cmd_args(int argc, int start, char **argv) +{ + int opt; + int option_index; + + static struct option long_options[] = { + { "bucket", required_argument, 0, 'b' }, + { "level", required_argument, 0, 'l' }, + { "trl-type", required_argument, 0, 'r' }, + { "trl", required_argument, 0, 't' }, + { "help", no_argument, 0, 'h' }, + { "clos", required_argument, 0, 'c' }, + { "desired", required_argument, 0, 'd' }, + { "epp", required_argument, 0, 'e' }, + { "min", required_argument, 0, 'n' }, + { "max", required_argument, 0, 'm' }, + { "priority", required_argument, 0, 'p' }, + { "weight", required_argument, 0, 'w' }, + { 0, 0, 0, 0 } + }; + + option_index = start; + + optind = start + 1; + while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:h", + long_options, &option_index)) != -1) { + switch (opt) { + case 'b': + fact_bucket = atoi(optarg); + break; + case 'h': + cmd_help = 1; + break; + case 'l': + tdp_level = atoi(optarg); + break; + case 't': + sscanf(optarg, "0x%llx", &fact_trl); + break; + case 'r': + if (!strncmp(optarg, "sse", 3)) { + fact_avx = 0x01; + } else if (!strncmp(optarg, "avx2", 4)) { + fact_avx = 0x02; + } else if (!strncmp(optarg, "avx512", 4)) { + fact_avx = 0x04; + } else { + fprintf(outf, "Invalid sse,avx options\n"); + exit(1); + } + break; + /* CLOS related */ + case 'c': + current_clos = atoi(optarg); + printf("clos %d\n", current_clos); + break; + case 'd': + clos_desired = atoi(optarg); + break; + case 'e': + clos_epp = atoi(optarg); + break; + case 'n': + clos_min = atoi(optarg); + break; + case 'm': + clos_max = atoi(optarg); + break; + case 'p': + clos_priority_type = atoi(optarg); + break; + case 'w': + clos_prop_prio = atoi(optarg); + break; + default: + printf("no match\n"); + } + } +} + +static void isst_help(void) +{ + printf("perf-profile:\tAn architectural mechanism that allows multiple optimized \n\ + performance profiles per system via static and/or dynamic\n\ + adjustment of core count, workload, Tjmax, and\n\ + TDP, etc.\n"); + printf("\nCommands : For feature=perf-profile\n"); + printf("\tinfo\n"); + printf("\tget-lock-status\n"); + printf("\tget-config-levels\n"); + printf("\tget-config-version\n"); + printf("\tget-config-enabled\n"); + printf("\tget-config-current-level\n"); + printf("\tset-config-level\n"); +} + +static void pbf_help(void) +{ + printf("base-freq:\tEnables users to increase guaranteed base frequency\n\ + on certain cores (high priority cores) in exchange for lower\n\ + base frequency on remaining cores (low priority cores).\n"); + printf("\tcommand : info\n"); + printf("\tcommand : enable\n"); + printf("\tcommand : disable\n"); +} + +static void fact_help(void) +{ + printf("turbo-freq:\tEnables the ability to set different turbo ratio\n\ + limits to cores based on priority.\n"); + printf("\nCommand: For feature=turbo-freq\n"); + printf("\tcommand : info\n"); + printf("\tcommand : enable\n"); + printf("\tcommand : disable\n"); +} + +static void core_power_help(void) +{ + printf("core-power:\tInterface that allows user to define per core/tile\n\ + priority.\n"); + printf("\nCommands : For feature=core-power\n"); + printf("\tinfo\n"); + printf("\tenable\n"); + printf("\tdisable\n"); + printf("\tconfig\n"); + printf("\tassoc\n"); + printf("\tget-assoc\n"); +} + +struct process_cmd_help_struct { + char *feature; + void (*process_fn)(void); +}; + +static struct process_cmd_help_struct isst_help_cmds[] = { + { "perf-profile", isst_help }, + { "base-freq", pbf_help }, + { "turbo-freq", fact_help }, + { "core-power", core_power_help }, + { NULL, NULL } +}; + +void process_command(int argc, char **argv) +{ + int i = 0, matched = 0; + char *feature = argv[optind]; + char *cmd = argv[optind + 1]; + + if (!feature || !cmd) + return; + + debug_printf("feature name [%s] command [%s]\n", feature, cmd); + if (!strcmp(cmd, "-h") || !strcmp(cmd, "--help")) { + while (isst_help_cmds[i].feature) { + if (!strcmp(isst_help_cmds[i].feature, feature)) { + isst_help_cmds[i].process_fn(); + exit(0); + } + ++i; + } + } + + create_cpu_map(); + + i = 0; + while (isst_cmds[i].feature) { + if (!strcmp(isst_cmds[i].feature, feature) && + !strcmp(isst_cmds[i].command, cmd)) { + parse_cmd_args(argc, optind + 1, argv); + isst_cmds[i].process_fn(); + matched = 1; + break; + } + ++i; + } + + if (!matched) + fprintf(stderr, "Invalid command\n"); +} + +static void usage(void) +{ + printf("Intel(R) Speed Select Technology\n"); + printf("\nUsage:\n"); + printf("intel-speed-select [OPTIONS] FEATURE COMMAND COMMAND_ARGUMENTS\n"); + printf("\nUse this tool to enumerate and control the Intel Speed Select Technology features,\n"); + printf("\nFEATURE : [perf-profile|base-freq|turbo-freq|core-power]\n"); + printf("\nFor help on each feature, use --h|--help\n"); + printf("\tFor example: intel-speed-select perf-profile -h\n"); + + printf("\nFor additional help on each command for a feature, use --h|--help\n"); + printf("\tFor example: intel-speed-select perf-profile get-lock-status -h\n"); + printf("\t\t This will print help for the command \"get-lock-status\" for the feature \"perf-profile\"\n"); + + printf("\nOPTIONS\n"); + printf("\t[-c|--cpu] : logical cpu number\n"); + printf("\t\tDefault: Die scoped for all dies in the system with multiple dies/package\n"); + printf("\t\t\t Or Package scoped for all Packages when each package contains one die\n"); + printf("\t[-d|--debug] : Debug mode\n"); + printf("\t[-h|--help] : Print help\n"); + printf("\t[-i|--info] : Print platform information\n"); + printf("\t[-o|--out] : Output file\n"); + printf("\t\t\tDefault : stderr\n"); + printf("\t[-f|--format] : output format [json|text]. Default: text\n"); + printf("\t[-v|--version] : Print version\n"); + + printf("\nResult format\n"); + printf("\tResult display uses a common format for each command:\n"); + printf("\tResults are formatted in text/JSON with\n"); + printf("\t\tPackage, Die, CPU, and command specific results.\n"); + printf("\t\t\tFor Set commands, status is 0 for success and rest for failures\n"); + exit(1); +} + +static void print_version(void) +{ + fprintf(outf, "Version %s\n", version_str); + fprintf(outf, "Build date %s time %s\n", __DATE__, __TIME__); + exit(0); +} + +static void cmdline(int argc, char **argv) +{ + int opt; + int option_index = 0; + + static struct option long_options[] = { + { "cpu", required_argument, 0, 'c' }, + { "debug", no_argument, 0, 'd' }, + { "format", required_argument, 0, 'f' }, + { "help", no_argument, 0, 'h' }, + { "info", no_argument, 0, 'i' }, + { "out", required_argument, 0, 'o' }, + { "version", no_argument, 0, 'v' }, + { 0, 0, 0, 0 } + }; + + progname = argv[0]; + while ((opt = getopt_long_only(argc, argv, "+c:df:hio:v", long_options, + &option_index)) != -1) { + switch (opt) { + case 'c': + parse_cpu_command(optarg); + break; + case 'd': + debug_flag = 1; + printf("Debug Mode ON\n"); + break; + case 'f': + if (!strncmp(optarg, "json", 4)) + out_format_json = 1; + break; + case 'h': + usage(); + break; + case 'i': + isst_print_platform_information(); + break; + case 'o': + if (outf) + fclose(outf); + outf = fopen_or_exit(optarg, "w"); + break; + case 'v': + print_version(); + break; + default: + usage(); + } + } + + if (geteuid() != 0) { + fprintf(stderr, "Must run as root\n"); + exit(0); + } + + if (optind > (argc - 2)) { + fprintf(stderr, "Feature name and|or command not specified\n"); + exit(0); + } + update_cpu_model(); + printf("Intel(R) Speed Select Technology\n"); + printf("Executing on CPU model:%d[0x%x]\n", cpu_model, cpu_model); + set_max_cpu_num(); + set_cpu_present_cpu_mask(); + set_cpu_target_cpu_mask(); + isst_fill_platform_info(); + if (isst_platform_info.api_version > supported_api_ver) { + printf("Incompatible API versions; Upgrade of tool is required\n"); + exit(0); + } + + process_command(argc, argv); +} + +int main(int argc, char **argv) +{ + outf = stderr; + cmdline(argc, argv); + return 0; +} diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c new file mode 100644 index 000000000000..8de4ac39a008 --- /dev/null +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -0,0 +1,721 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Speed Select -- Enumerate and control features + * Copyright (c) 2019 Intel Corporation. + */ + +#include "isst.h" + +int isst_get_ctdp_levels(int cpu, struct isst_pkg_ctdp *pkg_dev) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_GET_LEVELS_INFO, 0, 0, &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CONFIG_TDP_GET_LEVELS_INFO resp:%x\n", cpu, resp); + + pkg_dev->version = resp & 0xff; + pkg_dev->levels = (resp >> 8) & 0xff; + pkg_dev->current_level = (resp >> 16) & 0xff; + pkg_dev->locked = !!(resp & BIT(24)); + pkg_dev->enabled = !!(resp & BIT(31)); + + return 0; +} + +int isst_get_ctdp_control(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_GET_TDP_CONTROL, 0, + config_index, &resp); + if (ret) + return ret; + + ctdp_level->fact_support = resp & BIT(0); + ctdp_level->pbf_support = !!(resp & BIT(1)); + ctdp_level->fact_enabled = !!(resp & BIT(16)); + ctdp_level->pbf_enabled = !!(resp & BIT(17)); + + debug_printf( + "cpu:%d CONFIG_TDP_GET_TDP_CONTROL resp:%x fact_support:%d pbf_support: %d fact_enabled:%d pbf_enabled:%d\n", + cpu, resp, ctdp_level->fact_support, ctdp_level->pbf_support, + ctdp_level->fact_enabled, ctdp_level->pbf_enabled); + + return 0; +} + +int isst_get_tdp_info(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, CONFIG_TDP_GET_TDP_INFO, + 0, config_index, &resp); + if (ret) + return ret; + + ctdp_level->pkg_tdp = resp & GENMASK(14, 0); + ctdp_level->tdp_ratio = (resp & GENMASK(23, 16)) >> 16; + + debug_printf( + "cpu:%d ctdp:%d CONFIG_TDP_GET_TDP_INFO resp:%x tdp_ratio:%d pkg_tdp:%d\n", + cpu, config_index, resp, ctdp_level->tdp_ratio, + ctdp_level->pkg_tdp); + return 0; +} + +int isst_get_pwr_info(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, CONFIG_TDP_GET_PWR_INFO, + 0, config_index, &resp); + if (ret) + return ret; + + ctdp_level->pkg_max_power = resp & GENMASK(14, 0); + ctdp_level->pkg_min_power = (resp & GENMASK(30, 16)) >> 16; + + debug_printf( + "cpu:%d ctdp:%d CONFIG_TDP_GET_PWR_INFO resp:%x pkg_max_power:%d pkg_min_power:%d\n", + cpu, config_index, resp, ctdp_level->pkg_max_power, + ctdp_level->pkg_min_power); + + return 0; +} + +int isst_get_tjmax_info(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, CONFIG_TDP_GET_TJMAX_INFO, + 0, config_index, &resp); + if (ret) + return ret; + + ctdp_level->t_proc_hot = resp & GENMASK(7, 0); + + debug_printf( + "cpu:%d ctdp:%d CONFIG_TDP_GET_TJMAX_INFO resp:%x t_proc_hot:%d\n", + cpu, config_index, resp, ctdp_level->t_proc_hot); + + return 0; +} + +int isst_get_coremask_info(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level) +{ + unsigned int resp; + int i, ret; + + ctdp_level->cpu_count = 0; + for (i = 0; i < 2; ++i) { + unsigned long long mask; + int cpu_count = 0; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_GET_CORE_MASK, 0, + (i << 8) | config_index, &resp); + if (ret) + return ret; + + debug_printf( + "cpu:%d ctdp:%d mask:%d CONFIG_TDP_GET_CORE_MASK resp:%x\n", + cpu, config_index, i, resp); + + mask = (unsigned long long)resp << (32 * i); + set_cpu_mask_from_punit_coremask(cpu, mask, + ctdp_level->core_cpumask_size, + ctdp_level->core_cpumask, + &cpu_count); + ctdp_level->cpu_count += cpu_count; + debug_printf("cpu:%d ctdp:%d mask:%d cpu count:%d\n", cpu, + config_index, i, ctdp_level->cpu_count); + } + + return 0; +} + +int isst_get_get_trl(int cpu, int level, int avx_level, int *trl) +{ + unsigned int req, resp; + int ret; + + req = level | (avx_level << 16); + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_GET_TURBO_LIMIT_RATIOS, 0, req, + &resp); + if (ret) + return ret; + + debug_printf( + "cpu:%d CONFIG_TDP_GET_TURBO_LIMIT_RATIOS req:%x resp:%x\n", + cpu, req, resp); + + trl[0] = resp & GENMASK(7, 0); + trl[1] = (resp & GENMASK(15, 8)) >> 8; + trl[2] = (resp & GENMASK(23, 16)) >> 16; + trl[3] = (resp & GENMASK(31, 24)) >> 24; + + req = level | BIT(8) | (avx_level << 16); + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_GET_TURBO_LIMIT_RATIOS, 0, req, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CONFIG_TDP_GET_TURBO_LIMIT req:%x resp:%x\n", cpu, + req, resp); + + trl[4] = resp & GENMASK(7, 0); + trl[5] = (resp & GENMASK(15, 8)) >> 8; + trl[6] = (resp & GENMASK(23, 16)) >> 16; + trl[7] = (resp & GENMASK(31, 24)) >> 24; + + return 0; +} + +int isst_set_tdp_level_msr(int cpu, int tdp_level) +{ + int ret; + + debug_printf("cpu: tdp_level via MSR %d\n", cpu, tdp_level); + + if (isst_get_config_tdp_lock_status(cpu)) { + debug_printf("cpu: tdp_locked %d\n", cpu); + return -1; + } + + if (tdp_level > 2) + return -1; /* invalid value */ + + ret = isst_send_msr_command(cpu, 0x64b, 1, + (unsigned long long *)&tdp_level); + if (ret) + return ret; + + debug_printf("cpu: tdp_level via MSR successful %d\n", cpu, tdp_level); + + return 0; +} + +int isst_set_tdp_level(int cpu, int tdp_level) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, CONFIG_TDP_SET_LEVEL, 0, + tdp_level, &resp); + if (ret) + return isst_set_tdp_level_msr(cpu, tdp_level); + + return 0; +} + +int isst_get_pbf_info(int cpu, int level, struct isst_pbf_info *pbf_info) +{ + unsigned int req, resp; + int i, ret; + + pbf_info->core_cpumask_size = alloc_cpu_set(&pbf_info->core_cpumask); + + for (i = 0; i < 2; ++i) { + unsigned long long mask; + int count; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_PBF_GET_CORE_MASK_INFO, + 0, (i << 8) | level, &resp); + if (ret) + return ret; + + debug_printf( + "cpu:%d CONFIG_TDP_PBF_GET_CORE_MASK_INFO resp:%x\n", + cpu, resp); + + mask = (unsigned long long)resp << (32 * i); + set_cpu_mask_from_punit_coremask(cpu, mask, + pbf_info->core_cpumask_size, + pbf_info->core_cpumask, + &count); + } + + req = level; + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_PBF_GET_P1HI_P1LO_INFO, 0, req, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CONFIG_TDP_PBF_GET_P1HI_P1LO_INFO resp:%x\n", cpu, + resp); + + pbf_info->p1_low = resp & 0xff; + pbf_info->p1_high = (resp & GENMASK(15, 8)) >> 8; + + req = level; + ret = isst_send_mbox_command( + cpu, CONFIG_TDP, CONFIG_TDP_PBF_GET_TDP_INFO, 0, req, &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CONFIG_TDP_PBF_GET_TDP_INFO resp:%x\n", cpu, resp); + + pbf_info->tdp = resp & 0xffff; + + req = level; + ret = isst_send_mbox_command( + cpu, CONFIG_TDP, CONFIG_TDP_PBF_GET_TJ_MAX_INFO, 0, req, &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CONFIG_TDP_PBF_GET_TJ_MAX_INFO resp:%x\n", cpu, + resp); + pbf_info->t_control = (resp >> 8) & 0xff; + pbf_info->t_prochot = resp & 0xff; + + return 0; +} + +void isst_get_pbf_info_complete(struct isst_pbf_info *pbf_info) +{ + free_cpu_set(pbf_info->core_cpumask); +} + +int isst_set_pbf_fact_status(int cpu, int pbf, int enable) +{ + struct isst_pkg_ctdp pkg_dev; + struct isst_pkg_ctdp_level_info ctdp_level; + int current_level; + unsigned int req = 0, resp; + int ret; + + ret = isst_get_ctdp_levels(cpu, &pkg_dev); + if (ret) + return ret; + + current_level = pkg_dev.current_level; + + ret = isst_get_ctdp_control(cpu, current_level, &ctdp_level); + if (ret) + return ret; + + if (pbf) { + if (ctdp_level.fact_enabled) + req = BIT(16); + + if (enable) + req |= BIT(17); + else + req &= ~BIT(17); + } else { + if (ctdp_level.pbf_enabled) + req = BIT(17); + + if (enable) + req |= BIT(16); + else + req &= ~BIT(16); + } + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_SET_TDP_CONTROL, 0, req, &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CONFIG_TDP_SET_TDP_CONTROL pbf/fact:%d req:%x\n", + cpu, pbf, req); + + return 0; +} + +int isst_get_fact_bucket_info(int cpu, int level, + struct isst_fact_bucket_info *bucket_info) +{ + unsigned int resp; + int i, k, ret; + + for (i = 0; i < 2; ++i) { + int j; + + ret = isst_send_mbox_command( + cpu, CONFIG_TDP, + CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_NUMCORES, 0, + (i << 8) | level, &resp); + if (ret) + return ret; + + debug_printf( + "cpu:%d CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_NUMCORES index:%d level:%d resp:%x\n", + cpu, i, level, resp); + + for (j = 0; j < 4; ++j) { + bucket_info[j + (i * 4)].high_priority_cores_count = + (resp >> (j * 8)) & 0xff; + } + } + + for (k = 0; k < 3; ++k) { + for (i = 0; i < 2; ++i) { + int j; + + ret = isst_send_mbox_command( + cpu, CONFIG_TDP, + CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_RATIOS, 0, + (k << 16) | (i << 8) | level, &resp); + if (ret) + return ret; + + debug_printf( + "cpu:%d CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_RATIOS index:%d level:%d avx:%d resp:%x\n", + cpu, i, level, k, resp); + + for (j = 0; j < 4; ++j) { + switch (k) { + case 0: + bucket_info[j + (i * 4)].sse_trl = + (resp >> (j * 8)) & 0xff; + break; + case 1: + bucket_info[j + (i * 4)].avx_trl = + (resp >> (j * 8)) & 0xff; + break; + case 2: + bucket_info[j + (i * 4)].avx512_trl = + (resp >> (j * 8)) & 0xff; + break; + default: + break; + } + } + } + } + + return 0; +} + +int isst_get_fact_info(int cpu, int level, struct isst_fact_info *fact_info) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_TDP, + CONFIG_TDP_GET_FACT_LP_CLIPPING_RATIO, 0, + level, &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CONFIG_TDP_GET_FACT_LP_CLIPPING_RATIO resp:%x\n", + cpu, resp); + + fact_info->lp_clipping_ratio_license_sse = resp & 0xff; + fact_info->lp_clipping_ratio_license_avx2 = (resp >> 8) & 0xff; + fact_info->lp_clipping_ratio_license_avx512 = (resp >> 16) & 0xff; + + ret = isst_get_fact_bucket_info(cpu, level, fact_info->bucket_info); + + return ret; +} + +int isst_set_trl(int cpu, unsigned long long trl) +{ + int ret; + + if (!trl) + trl = 0xFFFFFFFFFFFFFFFFULL; + + ret = isst_send_msr_command(cpu, 0x1AD, 1, &trl); + if (ret) + return ret; + + return 0; +} + +int isst_set_trl_from_current_tdp(int cpu, unsigned long long trl) +{ + unsigned long long msr_trl; + int ret; + + if (trl) { + msr_trl = trl; + } else { + struct isst_pkg_ctdp pkg_dev; + int trl[8]; + int i; + + ret = isst_get_ctdp_levels(cpu, &pkg_dev); + if (ret) + return ret; + + ret = isst_get_get_trl(cpu, pkg_dev.current_level, 0, trl); + if (ret) + return ret; + + msr_trl = 0; + for (i = 0; i < 8; ++i) { + unsigned long long _trl = trl[i]; + + msr_trl |= (_trl << (i * 8)); + } + } + ret = isst_send_msr_command(cpu, 0x1AD, 1, &msr_trl); + if (ret) + return ret; + + return 0; +} + +/* Return 1 if locked */ +int isst_get_config_tdp_lock_status(int cpu) +{ + unsigned long long tdp_control = 0; + int ret; + + ret = isst_send_msr_command(cpu, 0x64b, 0, &tdp_control); + if (ret) + return ret; + + ret = !!(tdp_control & BIT(31)); + + return ret; +} + +void isst_get_process_ctdp_complete(int cpu, struct isst_pkg_ctdp *pkg_dev) +{ + int i; + + if (!pkg_dev->processed) + return; + + for (i = 0; i < pkg_dev->levels; ++i) { + struct isst_pkg_ctdp_level_info *ctdp_level; + + ctdp_level = &pkg_dev->ctdp_level[i]; + if (ctdp_level->pbf_support) + free_cpu_set(ctdp_level->pbf_info.core_cpumask); + free_cpu_set(ctdp_level->core_cpumask); + } +} + +int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) +{ + int i, ret; + + if (pkg_dev->processed) + return 0; + + ret = isst_get_ctdp_levels(cpu, pkg_dev); + if (ret) + return ret; + + debug_printf("cpu: %d ctdp enable:%d current level: %d levels:%d\n", + cpu, pkg_dev->enabled, pkg_dev->current_level, + pkg_dev->levels); + + for (i = 0; i <= pkg_dev->levels; ++i) { + struct isst_pkg_ctdp_level_info *ctdp_level; + + if (tdp_level != 0xff && i != tdp_level) + continue; + + debug_printf("cpu:%d Get Information for TDP level:%d\n", cpu, + i); + ctdp_level = &pkg_dev->ctdp_level[i]; + + ctdp_level->processed = 1; + ctdp_level->level = i; + ctdp_level->control_cpu = cpu; + ctdp_level->pkg_id = get_physical_package_id(cpu); + ctdp_level->die_id = get_physical_die_id(cpu); + + ret = isst_get_ctdp_control(cpu, i, ctdp_level); + if (ret) + return ret; + + ret = isst_get_tdp_info(cpu, i, ctdp_level); + if (ret) + return ret; + + ret = isst_get_pwr_info(cpu, i, ctdp_level); + if (ret) + return ret; + + ret = isst_get_tjmax_info(cpu, i, ctdp_level); + if (ret) + return ret; + + ctdp_level->core_cpumask_size = + alloc_cpu_set(&ctdp_level->core_cpumask); + ret = isst_get_coremask_info(cpu, i, ctdp_level); + if (ret) + return ret; + + ret = isst_get_get_trl(cpu, i, 0, + ctdp_level->trl_sse_active_cores); + if (ret) + return ret; + + ret = isst_get_get_trl(cpu, i, 1, + ctdp_level->trl_avx_active_cores); + if (ret) + return ret; + + ret = isst_get_get_trl(cpu, i, 2, + ctdp_level->trl_avx_512_active_cores); + if (ret) + return ret; + + if (ctdp_level->pbf_support) { + ret = isst_get_pbf_info(cpu, i, &ctdp_level->pbf_info); + if (!ret) + ctdp_level->pbf_found = 1; + } + + if (ctdp_level->fact_support) { + ret = isst_get_fact_info(cpu, i, + &ctdp_level->fact_info); + if (ret) + return ret; + } + } + + pkg_dev->processed = 1; + + return 0; +} + +int isst_pm_qos_config(int cpu, int enable_clos, int priority_type) +{ + unsigned int req, resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_QOS_CONFIG, 0, 0, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PM_QOS_CONFIG resp:%x\n", cpu, resp); + + req = resp; + + if (enable_clos) + req = req | BIT(1); + else + req = req & ~BIT(1); + + if (priority_type) + req = req | BIT(2); + else + req = req & ~BIT(2); + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_QOS_CONFIG, + BIT(MBOX_CMD_WRITE_BIT), req, &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PM_QOS_CONFIG priority type:%d req:%x\n", cpu, + priority_type, req); + + return 0; +} + +int isst_pm_get_clos(int cpu, int clos, struct isst_clos_config *clos_config) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_CLOS, clos, 0, + &resp); + if (ret) + return ret; + + clos_config->pkg_id = get_physical_package_id(cpu); + clos_config->die_id = get_physical_die_id(cpu); + + clos_config->epp = resp & 0x0f; + clos_config->clos_prop_prio = (resp >> 4) & 0x0f; + clos_config->clos_min = (resp >> 8) & 0xff; + clos_config->clos_max = (resp >> 16) & 0xff; + clos_config->clos_desired = (resp >> 24) & 0xff; + + return 0; +} + +int isst_set_clos(int cpu, int clos, struct isst_clos_config *clos_config) +{ + unsigned int req, resp; + unsigned int param; + int ret; + + req = clos_config->epp & 0x0f; + req |= (clos_config->clos_prop_prio & 0x0f) << 4; + req |= (clos_config->clos_min & 0xff) << 8; + req |= (clos_config->clos_max & 0xff) << 16; + req |= (clos_config->clos_desired & 0xff) << 24; + + param = BIT(MBOX_CMD_WRITE_BIT) | clos; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_CLOS, param, req, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PM_CLOS param:%x req:%x\n", cpu, param, req); + + return 0; +} + +int isst_clos_get_assoc_status(int cpu, int *clos_id) +{ + unsigned int resp; + unsigned int param; + int core_id, ret; + + core_id = find_phy_core_num(cpu); + param = core_id; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PQR_ASSOC, param, 0, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PQR_ASSOC param:%x resp:%x\n", cpu, param, + resp); + *clos_id = (resp >> 16) & 0x03; + + return 0; +} + +int isst_clos_associate(int cpu, int clos_id) +{ + unsigned int req, resp; + unsigned int param; + int core_id, ret; + + req = (clos_id & 0x03) << 16; + core_id = find_phy_core_num(cpu); + param = BIT(MBOX_CMD_WRITE_BIT) | core_id; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PQR_ASSOC, param, + req, &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PQR_ASSOC param:%x req:%x\n", cpu, param, + req); + + return 0; +} diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c new file mode 100644 index 000000000000..f368b8323742 --- /dev/null +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel dynamic_speed_select -- Enumerate and control features + * Copyright (c) 2019 Intel Corporation. + */ + +#include "isst.h" + +#define DISP_FREQ_MULTIPLIER 100000 + +static void printcpumask(int str_len, char *str, int mask_size, + cpu_set_t *cpu_mask) +{ + int i, max_cpus = get_topo_max_cpus(); + unsigned int *mask; + int size, index, curr_index; + + size = max_cpus / (sizeof(unsigned int) * 8); + if (max_cpus % (sizeof(unsigned int) * 8)) + size++; + + mask = calloc(size, sizeof(unsigned int)); + if (!mask) + return; + + for (i = 0; i < max_cpus; ++i) { + int mask_index, bit_index; + + if (!CPU_ISSET_S(i, mask_size, cpu_mask)) + continue; + + mask_index = i / (sizeof(unsigned int) * 8); + bit_index = i % (sizeof(unsigned int) * 8); + mask[mask_index] |= BIT(bit_index); + } + + curr_index = 0; + for (i = size - 1; i >= 0; --i) { + index = snprintf(&str[curr_index], str_len - curr_index, "%08x", + mask[i]); + curr_index += index; + if (i) { + strncat(&str[curr_index], ",", str_len - curr_index); + curr_index++; + } + } + + free(mask); +} + +static void format_and_print_txt(FILE *outf, int level, char *header, + char *value) +{ + char *spaces = " "; + static char delimiters[256]; + int i, j = 0; + + if (!level) + return; + + if (level == 1) { + strcpy(delimiters, " "); + } else { + for (i = 0; i < level - 1; ++i) + j += snprintf(&delimiters[j], sizeof(delimiters) - j, + "%s", spaces); + } + + if (header && value) { + fprintf(outf, "%s", delimiters); + fprintf(outf, "%s:%s\n", header, value); + } else if (header) { + fprintf(outf, "%s", delimiters); + fprintf(outf, "%s\n", header); + } +} + +static int last_level; +static void format_and_print(FILE *outf, int level, char *header, char *value) +{ + char *spaces = " "; + static char delimiters[256]; + int i; + + if (!out_format_is_json()) { + format_and_print_txt(outf, level, header, value); + return; + } + + if (level == 0) { + if (header) + fprintf(outf, "{"); + else + fprintf(outf, "\n}\n"); + + } else { + int j = 0; + + for (i = 0; i < level; ++i) + j += snprintf(&delimiters[j], sizeof(delimiters) - j, + "%s", spaces); + + if (last_level == level) + fprintf(outf, ",\n"); + + if (value) { + if (last_level != level) + fprintf(outf, "\n"); + + fprintf(outf, "%s\"%s\": ", delimiters, header); + fprintf(outf, "\"%s\"", value); + } else { + for (i = last_level - 1; i >= level; --i) { + int k = 0; + + for (j = i; j > 0; --j) + k += snprintf(&delimiters[k], + sizeof(delimiters) - k, + "%s", spaces); + if (i == level && header) + fprintf(outf, "\n%s},", delimiters); + else + fprintf(outf, "\n%s}", delimiters); + } + if (abs(last_level - level) < 3) + fprintf(outf, "\n"); + if (header) + fprintf(outf, "%s\"%s\": {", delimiters, + header); + } + } + + last_level = level; +} + +static void print_packag_info(int cpu, FILE *outf) +{ + char header[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); +} + +static void _isst_pbf_display_information(int cpu, FILE *outf, int level, + struct isst_pbf_info *pbf_info, + int disp_level) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "speed-select-base-freq"); + format_and_print(outf, disp_level, header, NULL); + + snprintf(header, sizeof(header), "high-priority-base-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + pbf_info->p1_high * DISP_FREQ_MULTIPLIER); + format_and_print(outf, disp_level + 1, header, value); + + snprintf(header, sizeof(header), "high-priority-cpu-mask"); + printcpumask(sizeof(value), value, pbf_info->core_cpumask_size, + pbf_info->core_cpumask); + format_and_print(outf, disp_level + 1, header, value); + + snprintf(header, sizeof(header), "low-priority-base-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + pbf_info->p1_low * DISP_FREQ_MULTIPLIER); + format_and_print(outf, disp_level + 1, header, value); + + snprintf(header, sizeof(header), "tjunction-temperature(C)"); + snprintf(value, sizeof(value), "%d", pbf_info->t_prochot); + format_and_print(outf, disp_level + 1, header, value); + + snprintf(header, sizeof(header), "thermal-design-power(W)"); + snprintf(value, sizeof(value), "%d", pbf_info->tdp); + format_and_print(outf, disp_level + 1, header, value); +} + +static void _isst_fact_display_information(int cpu, FILE *outf, int level, + int fact_bucket, int fact_avx, + struct isst_fact_info *fact_info, + int base_level) +{ + struct isst_fact_bucket_info *bucket_info = fact_info->bucket_info; + char header[256]; + char value[256]; + int j; + + snprintf(header, sizeof(header), "speed-select-turbo-freq"); + format_and_print(outf, base_level, header, NULL); + for (j = 0; j < ISST_FACT_MAX_BUCKETS; ++j) { + if (fact_bucket != 0xff && fact_bucket != j) + continue; + + if (!bucket_info[j].high_priority_cores_count) + break; + + snprintf(header, sizeof(header), "bucket-%d", j); + format_and_print(outf, base_level + 1, header, NULL); + + snprintf(header, sizeof(header), "high-priority-cores-count"); + snprintf(value, sizeof(value), "%d", + bucket_info[j].high_priority_cores_count); + format_and_print(outf, base_level + 2, header, value); + + if (fact_avx & 0x01) { + snprintf(header, sizeof(header), + "high-priority-max-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + bucket_info[j].sse_trl * DISP_FREQ_MULTIPLIER); + format_and_print(outf, base_level + 2, header, value); + } + + if (fact_avx & 0x02) { + snprintf(header, sizeof(header), + "high-priority-max-avx2-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + bucket_info[j].avx_trl * DISP_FREQ_MULTIPLIER); + format_and_print(outf, base_level + 2, header, value); + } + + if (fact_avx & 0x04) { + snprintf(header, sizeof(header), + "high-priority-max-avx512-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + bucket_info[j].avx512_trl * + DISP_FREQ_MULTIPLIER); + format_and_print(outf, base_level + 2, header, value); + } + } + snprintf(header, sizeof(header), + "speed-select-turbo-freq-clip-frequencies"); + format_and_print(outf, base_level + 1, header, NULL); + snprintf(header, sizeof(header), "low-priority-max-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + fact_info->lp_clipping_ratio_license_sse * + DISP_FREQ_MULTIPLIER); + format_and_print(outf, base_level + 2, header, value); + snprintf(header, sizeof(header), + "low-priority-max-avx2-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + fact_info->lp_clipping_ratio_license_avx2 * + DISP_FREQ_MULTIPLIER); + format_and_print(outf, base_level + 2, header, value); + snprintf(header, sizeof(header), + "low-priority-max-avx512-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + fact_info->lp_clipping_ratio_license_avx512 * + DISP_FREQ_MULTIPLIER); + format_and_print(outf, base_level + 2, header, value); +} + +void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, + struct isst_pkg_ctdp *pkg_dev) +{ + char header[256]; + char value[256]; + int i, base_level = 1; + + print_packag_info(cpu, outf); + + for (i = 0; i <= pkg_dev->levels; ++i) { + struct isst_pkg_ctdp_level_info *ctdp_level; + int j; + + ctdp_level = &pkg_dev->ctdp_level[i]; + if (!ctdp_level->processed) + continue; + + snprintf(header, sizeof(header), "perf-profile-level-%d", + ctdp_level->level); + format_and_print(outf, base_level + 3, header, NULL); + + snprintf(header, sizeof(header), "cpu-count"); + j = get_cpu_count(get_physical_die_id(cpu), + get_physical_die_id(cpu)); + snprintf(value, sizeof(value), "%d", j); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), "enable-cpu-mask"); + printcpumask(sizeof(value), value, + ctdp_level->core_cpumask_size, + ctdp_level->core_cpumask); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), "thermal-design-power-ratio"); + snprintf(value, sizeof(value), "%d", ctdp_level->tdp_ratio); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), "base-frequency(KHz)"); + snprintf(value, sizeof(value), "%d", + ctdp_level->tdp_ratio * DISP_FREQ_MULTIPLIER); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), + "speed-select-turbo-freq-support"); + snprintf(value, sizeof(value), "%d", ctdp_level->fact_support); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), + "speed-select-base-freq-support"); + snprintf(value, sizeof(value), "%d", ctdp_level->pbf_support); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), + "speed-select-base-freq-enabled"); + snprintf(value, sizeof(value), "%d", ctdp_level->pbf_enabled); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), + "speed-select-turbo-freq-enabled"); + snprintf(value, sizeof(value), "%d", ctdp_level->fact_enabled); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), "thermal-design-power(W)"); + snprintf(value, sizeof(value), "%d", ctdp_level->pkg_tdp); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), "tjunction-max(C)"); + snprintf(value, sizeof(value), "%d", ctdp_level->t_proc_hot); + format_and_print(outf, base_level + 4, header, value); + + snprintf(header, sizeof(header), "turbo-ratio-limits-sse"); + format_and_print(outf, base_level + 4, header, NULL); + for (j = 0; j < 8; ++j) { + snprintf(header, sizeof(header), "bucket-%d", j); + format_and_print(outf, base_level + 5, header, NULL); + + snprintf(header, sizeof(header), "core-count"); + snprintf(value, sizeof(value), "%d", j); + format_and_print(outf, base_level + 6, header, value); + + snprintf(header, sizeof(header), "turbo-ratio"); + snprintf(value, sizeof(value), "%d", + ctdp_level->trl_sse_active_cores[j]); + format_and_print(outf, base_level + 6, header, value); + } + snprintf(header, sizeof(header), "turbo-ratio-limits-avx"); + format_and_print(outf, base_level + 4, header, NULL); + for (j = 0; j < 8; ++j) { + snprintf(header, sizeof(header), "bucket-%d", j); + format_and_print(outf, base_level + 5, header, NULL); + + snprintf(header, sizeof(header), "core-count"); + snprintf(value, sizeof(value), "%d", j); + format_and_print(outf, base_level + 6, header, value); + + snprintf(header, sizeof(header), "turbo-ratio"); + snprintf(value, sizeof(value), "%d", + ctdp_level->trl_avx_active_cores[j]); + format_and_print(outf, base_level + 6, header, value); + } + + snprintf(header, sizeof(header), "turbo-ratio-limits-avx512"); + format_and_print(outf, base_level + 4, header, NULL); + for (j = 0; j < 8; ++j) { + snprintf(header, sizeof(header), "bucket-%d", j); + format_and_print(outf, base_level + 5, header, NULL); + + snprintf(header, sizeof(header), "core-count"); + snprintf(value, sizeof(value), "%d", j); + format_and_print(outf, base_level + 6, header, value); + + snprintf(header, sizeof(header), "turbo-ratio"); + snprintf(value, sizeof(value), "%d", + ctdp_level->trl_avx_512_active_cores[j]); + format_and_print(outf, base_level + 6, header, value); + } + if (ctdp_level->pbf_support) + _isst_pbf_display_information(cpu, outf, i, + &ctdp_level->pbf_info, + base_level + 4); + if (ctdp_level->fact_support) + _isst_fact_display_information(cpu, outf, i, 0xff, 0xff, + &ctdp_level->fact_info, + base_level + 4); + } + + format_and_print(outf, 1, NULL, NULL); +} + +void isst_ctdp_display_information_start(FILE *outf) +{ + last_level = 0; + format_and_print(outf, 0, "start", NULL); +} + +void isst_ctdp_display_information_end(FILE *outf) +{ + format_and_print(outf, 0, NULL, NULL); +} + +void isst_pbf_display_information(int cpu, FILE *outf, int level, + struct isst_pbf_info *pbf_info) +{ + print_packag_info(cpu, outf); + _isst_pbf_display_information(cpu, outf, level, pbf_info, 4); + format_and_print(outf, 1, NULL, NULL); +} + +void isst_fact_display_information(int cpu, FILE *outf, int level, + int fact_bucket, int fact_avx, + struct isst_fact_info *fact_info) +{ + print_packag_info(cpu, outf); + _isst_fact_display_information(cpu, outf, level, fact_bucket, fact_avx, + fact_info, 4); + format_and_print(outf, 1, NULL, NULL); +} + +void isst_clos_display_information(int cpu, FILE *outf, int clos, + struct isst_clos_config *clos_config) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(header, sizeof(header), "core-power"); + format_and_print(outf, 4, header, NULL); + + snprintf(header, sizeof(header), "clos"); + snprintf(value, sizeof(value), "%d", clos); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "epp"); + snprintf(value, sizeof(value), "%d", clos_config->epp); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "clos-proportional-priority"); + snprintf(value, sizeof(value), "%d", clos_config->clos_prop_prio); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "clos-min"); + snprintf(value, sizeof(value), "%d", clos_config->clos_min); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "clos-max"); + snprintf(value, sizeof(value), "%d", clos_config->clos_max); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "clos-desired"); + snprintf(value, sizeof(value), "%d", clos_config->clos_desired); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} + +void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, + int result) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + snprintf(header, sizeof(header), "%s", feature); + format_and_print(outf, 4, header, NULL); + snprintf(header, sizeof(header), "%s", cmd); + snprintf(value, sizeof(value), "%d", result); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h new file mode 100644 index 000000000000..221881761609 --- /dev/null +++ b/tools/power/x86/intel-speed-select/isst.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Intel Speed Select -- Enumerate and control features + * Copyright (c) 2019 Intel Corporation. + */ + +#ifndef _ISST_H_ +#define _ISST_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define BIT(x) (1 << (x)) +#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h)))) +#define GENMASK_ULL(h, l) \ + (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h)))) + +#define CONFIG_TDP 0x7f +#define CONFIG_TDP_GET_LEVELS_INFO 0x00 +#define CONFIG_TDP_GET_TDP_CONTROL 0x01 +#define CONFIG_TDP_SET_TDP_CONTROL 0x02 +#define CONFIG_TDP_GET_TDP_INFO 0x03 +#define CONFIG_TDP_GET_PWR_INFO 0x04 +#define CONFIG_TDP_GET_TJMAX_INFO 0x05 +#define CONFIG_TDP_GET_CORE_MASK 0x06 +#define CONFIG_TDP_GET_TURBO_LIMIT_RATIOS 0x07 +#define CONFIG_TDP_SET_LEVEL 0x08 +#define CONFIG_TDP_GET_UNCORE_P0_P1_INFO 0X09 +#define CONFIG_TDP_GET_P1_INFO 0x0a +#define CONFIG_TDP_GET_MEM_FREQ 0x0b + +#define CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_NUMCORES 0x10 +#define CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_RATIOS 0x11 +#define CONFIG_TDP_GET_FACT_LP_CLIPPING_RATIO 0x12 + +#define CONFIG_TDP_PBF_GET_CORE_MASK_INFO 0x20 +#define CONFIG_TDP_PBF_GET_P1HI_P1LO_INFO 0x21 +#define CONFIG_TDP_PBF_GET_TJ_MAX_INFO 0x22 +#define CONFIG_TDP_PBF_GET_TDP_INFO 0X23 + +#define CONFIG_CLOS 0xd0 +#define CLOS_PQR_ASSOC 0x00 +#define CLOS_PM_CLOS 0x01 +#define CLOS_PM_QOS_CONFIG 0x02 +#define CLOS_STATUS 0x03 + +#define MBOX_CMD_WRITE_BIT 0x08 + +#define PM_QOS_INFO_OFFSET 0x00 +#define PM_QOS_CONFIG_OFFSET 0x04 +#define PM_CLOS_OFFSET 0x08 +#define PQR_ASSOC_OFFSET 0x20 + +struct isst_clos_config { + int pkg_id; + int die_id; + unsigned char epp; + unsigned char clos_prop_prio; + unsigned char clos_min; + unsigned char clos_max; + unsigned char clos_desired; +}; + +struct isst_fact_bucket_info { + int high_priority_cores_count; + int sse_trl; + int avx_trl; + int avx512_trl; +}; + +struct isst_pbf_info { + int pbf_acticated; + int pbf_available; + size_t core_cpumask_size; + cpu_set_t *core_cpumask; + int p1_high; + int p1_low; + int t_control; + int t_prochot; + int tdp; +}; + +#define ISST_TRL_MAX_ACTIVE_CORES 8 +#define ISST_FACT_MAX_BUCKETS 8 +struct isst_fact_info { + int lp_clipping_ratio_license_sse; + int lp_clipping_ratio_license_avx2; + int lp_clipping_ratio_license_avx512; + struct isst_fact_bucket_info bucket_info[ISST_FACT_MAX_BUCKETS]; +}; + +struct isst_pkg_ctdp_level_info { + int processed; + int control_cpu; + int pkg_id; + int die_id; + int level; + int fact_support; + int pbf_support; + int fact_enabled; + int pbf_enabled; + int tdp_ratio; + int active; + int tdp_control; + int pkg_tdp; + int pkg_min_power; + int pkg_max_power; + int fact; + int t_proc_hot; + int uncore_p0; + int uncore_p1; + int sse_p1; + int avx2_p1; + int avx512_p1; + int mem_freq; + size_t core_cpumask_size; + cpu_set_t *core_cpumask; + int cpu_count; + int trl_sse_active_cores[ISST_TRL_MAX_ACTIVE_CORES]; + int trl_avx_active_cores[ISST_TRL_MAX_ACTIVE_CORES]; + int trl_avx_512_active_cores[ISST_TRL_MAX_ACTIVE_CORES]; + int kobj_bucket_index; + int active_bucket; + int fact_max_index; + int fact_max_config; + int pbf_found; + int pbf_active; + struct isst_pbf_info pbf_info; + struct isst_fact_info fact_info; +}; + +#define ISST_MAX_TDP_LEVELS (4 + 1) /* +1 for base config */ +struct isst_pkg_ctdp { + int locked; + int version; + int processed; + int levels; + int current_level; + int enabled; + struct isst_pkg_ctdp_level_info ctdp_level[ISST_MAX_TDP_LEVELS]; +}; + +extern int get_topo_max_cpus(void); +extern int get_cpu_count(int pkg_id, int die_id); + +/* Common interfaces */ +extern void debug_printf(const char *format, ...); +extern int out_format_is_json(void); +extern int get_physical_package_id(int cpu); +extern int get_physical_die_id(int cpu); +extern size_t alloc_cpu_set(cpu_set_t **cpu_set); +extern void free_cpu_set(cpu_set_t *cpu_set); +extern int find_logical_cpu(int pkg_id, int die_id, int phy_cpu); +extern int find_phy_cpu_num(int logical_cpu); +extern int find_phy_core_num(int logical_cpu); +extern void set_cpu_mask_from_punit_coremask(int cpu, + unsigned long long core_mask, + size_t core_cpumask_size, + cpu_set_t *core_cpumask, + int *cpu_cnt); + +extern int isst_send_mbox_command(unsigned int cpu, unsigned char command, + unsigned char sub_command, + unsigned int write, + unsigned int req_data, unsigned int *resp); + +extern int isst_send_msr_command(unsigned int cpu, unsigned int command, + int write, unsigned long long *req_resp); + +extern int isst_get_ctdp_levels(int cpu, struct isst_pkg_ctdp *pkg_dev); +extern int isst_get_process_ctdp(int cpu, int tdp_level, + struct isst_pkg_ctdp *pkg_dev); +extern void isst_get_process_ctdp_complete(int cpu, + struct isst_pkg_ctdp *pkg_dev); +extern void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, + struct isst_pkg_ctdp *pkg_dev); +extern void isst_ctdp_display_information_start(FILE *outf); +extern void isst_ctdp_display_information_end(FILE *outf); +extern void isst_pbf_display_information(int cpu, FILE *outf, int level, + struct isst_pbf_info *info); +extern int isst_set_tdp_level(int cpu, int tdp_level); +extern int isst_set_tdp_level_msr(int cpu, int tdp_level); +extern int isst_set_pbf_fact_status(int cpu, int pbf, int enable); +extern int isst_get_pbf_info(int cpu, int level, + struct isst_pbf_info *pbf_info); +extern void isst_get_pbf_info_complete(struct isst_pbf_info *pbf_info); +extern int isst_get_fact_info(int cpu, int level, + struct isst_fact_info *fact_info); +extern int isst_get_fact_bucket_info(int cpu, int level, + struct isst_fact_bucket_info *bucket_info); +extern void isst_fact_display_information(int cpu, FILE *outf, int level, + int fact_bucket, int fact_avx, + struct isst_fact_info *fact_info); +extern int isst_set_trl(int cpu, unsigned long long trl); +extern int isst_set_trl_from_current_tdp(int cpu, unsigned long long trl); +extern int isst_get_config_tdp_lock_status(int cpu); + +extern int isst_pm_qos_config(int cpu, int enable_clos, int priority_type); +extern int isst_pm_get_clos(int cpu, int clos, + struct isst_clos_config *clos_config); +extern int isst_set_clos(int cpu, int clos, + struct isst_clos_config *clos_config); +extern int isst_clos_associate(int cpu, int clos); +extern int isst_clos_get_assoc_status(int cpu, int *clos_id); +extern void isst_clos_display_information(int cpu, FILE *outf, int clos, + struct isst_clos_config *clos_config); + +extern int isst_read_reg(unsigned short reg, unsigned int *val); +extern int isst_write_reg(int reg, unsigned int val); + +extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, + int result); +#endif -- cgit v1.2.3 From 5a1ea4774ddc2c6bc3ba1415880091eccf1a901e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 3 Jul 2019 22:33:59 +0530 Subject: powerpc/pseries: Move mm/book3s64/vphn.c under platforms/pseries/ hcall_vphn() is specific to pseries and will be used in a subsequent patch. So, move it to a more appropriate place under arch/powerpc/platforms/pseries. Also merge vphn.h into lppaca.h and update vphn selftest to use the new files. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/lppaca.h | 24 ++++++ arch/powerpc/mm/book3s64/Makefile | 1 - arch/powerpc/mm/book3s64/vphn.c | 73 ------------------- arch/powerpc/mm/book3s64/vphn.h | 24 ------ arch/powerpc/mm/numa.c | 14 ---- arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/vphn.c | 89 +++++++++++++++++++++++ tools/testing/selftests/powerpc/vphn/Makefile | 2 +- tools/testing/selftests/powerpc/vphn/asm/lppaca.h | 1 + tools/testing/selftests/powerpc/vphn/vphn.c | 2 +- tools/testing/selftests/powerpc/vphn/vphn.h | 1 - 11 files changed, 117 insertions(+), 115 deletions(-) delete mode 100644 arch/powerpc/mm/book3s64/vphn.c delete mode 100644 arch/powerpc/mm/book3s64/vphn.h create mode 100644 arch/powerpc/platforms/pseries/vphn.c create mode 120000 tools/testing/selftests/powerpc/vphn/asm/lppaca.h delete mode 120000 tools/testing/selftests/powerpc/vphn/vphn.h (limited to 'tools') diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index e45b7293414d..f5195b4d9ffb 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -18,6 +18,29 @@ */ #ifndef _ASM_POWERPC_LPPACA_H #define _ASM_POWERPC_LPPACA_H + +/* + * The below VPHN macros are outside the __KERNEL__ check since these are + * used for compiling the vphn selftest in userspace + */ + +/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */ +#define VPHN_REGISTER_COUNT 6 + +/* + * 6 64-bit registers unpacked into up to 24 be32 associativity values. To + * form the complete property we have to add the length in the first cell. + */ +#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) + +/* + * The H_HOME_NODE_ASSOCIATIVITY hcall takes two values for flags: + * 1 for retrieving associativity information for a guest cpu + * 2 for retrieving associativity information for a host/hypervisor cpu + */ +#define VPHN_FLAG_VCPU 1 +#define VPHN_FLAG_PCPU 2 + #ifdef __KERNEL__ /* @@ -179,6 +202,7 @@ extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); extern void register_dtl_buffer(int cpu); extern void alloc_dtl_buffers(void); +extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity); #endif /* CONFIG_PPC_BOOK3S */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 974b4fc19f4f..fd393b8be14f 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_PPC_NATIVE) += hash_native.o obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o diff --git a/arch/powerpc/mm/book3s64/vphn.c b/arch/powerpc/mm/book3s64/vphn.c deleted file mode 100644 index 0ee7734afb50..000000000000 --- a/arch/powerpc/mm/book3s64/vphn.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "vphn.h" - -/* - * The associativity domain numbers are returned from the hypervisor as a - * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the - * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 - * bytes. - * - * --- 16-bit fields --> - * _________________________ - * | 0 | 1 | 2 | 3 | be_packed[0] - * ------+-----+-----+------ - * _________________________ - * | 4 | 5 | 6 | 7 | be_packed[1] - * ------------------------- - * ... - * _________________________ - * | 20 | 21 | 22 | 23 | be_packed[5] - * ------------------------- - * - * Convert to the sequence they would appear in the ibm,associativity property. - */ -int vphn_unpack_associativity(const long *packed, __be32 *unpacked) -{ - __be64 be_packed[VPHN_REGISTER_COUNT]; - int i, nr_assoc_doms = 0; - const __be16 *field = (const __be16 *) be_packed; - u16 last = 0; - bool is_32bit = false; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - /* Let's fix the values returned by plpar_hcall9() */ - for (i = 0; i < VPHN_REGISTER_COUNT; i++) - be_packed[i] = cpu_to_be64(packed[i]); - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - u16 new = be16_to_cpup(field++); - - if (is_32bit) { - /* - * Let's concatenate the 16 bits of this field to the - * 15 lower bits of the previous field - */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(last << 16 | new); - is_32bit = false; - } else if (new == VPHN_FIELD_UNUSED) - /* This is the list terminator */ - break; - else if (new & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(new & VPHN_FIELD_MASK); - } else { - /* - * Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - last = new; - is_32bit = true; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = cpu_to_be32(nr_assoc_doms); - - return nr_assoc_doms; -} diff --git a/arch/powerpc/mm/book3s64/vphn.h b/arch/powerpc/mm/book3s64/vphn.h deleted file mode 100644 index f7ff1e0c3801..000000000000 --- a/arch/powerpc/mm/book3s64/vphn.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARCH_POWERPC_MM_VPHN_H_ -#define _ARCH_POWERPC_MM_VPHN_H_ - -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */ -#define VPHN_REGISTER_COUNT 6 - -/* - * 6 64-bit registers unpacked into up to 24 be32 associativity values. To - * form the complete property we have to add the length in the first cell. - */ -#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) - -/* - * The H_HOME_NODE_ASSOCIATIVITY hcall takes two values for flags: - * 1 for retrieving associativity information for a guest cpu - * 2 for retrieving associativity information for a host/hypervisor cpu - */ -#define VPHN_FLAG_VCPU 1 -#define VPHN_FLAG_PCPU 2 - -extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); - -#endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 57f006b6214b..50fadc99897b 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1067,9 +1067,6 @@ u64 memory_hotplug_max(void) /* Virtual Processor Home Node (VPHN) support */ #ifdef CONFIG_PPC_SPLPAR - -#include "book3s64/vphn.h" - struct topology_update_data { struct topology_update_data *next; unsigned int cpu; @@ -1087,17 +1084,6 @@ static void reset_topology_timer(void); static int topology_timer_secs = 1; static int topology_inited; -static long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity) -{ - long rc; - long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; - - rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, cpu); - vphn_unpack_associativity(retbuf, associativity); - - return rc; -} - /* * Change polling interval for associativity changes. */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index a43ec843c8e2..ab3d59aeacca 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o +obj-$(CONFIG_PPC_SPLPAR) += vphn.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/vphn.c b/arch/powerpc/platforms/pseries/vphn.c new file mode 100644 index 000000000000..3f07bf6c670e --- /dev/null +++ b/arch/powerpc/platforms/pseries/vphn.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +/* + * The associativity domain numbers are returned from the hypervisor as a + * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the + * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 + * bytes. + * + * --- 16-bit fields --> + * _________________________ + * | 0 | 1 | 2 | 3 | be_packed[0] + * ------+-----+-----+------ + * _________________________ + * | 4 | 5 | 6 | 7 | be_packed[1] + * ------------------------- + * ... + * _________________________ + * | 20 | 21 | 22 | 23 | be_packed[5] + * ------------------------- + * + * Convert to the sequence they would appear in the ibm,associativity property. + */ +static int vphn_unpack_associativity(const long *packed, __be32 *unpacked) +{ + __be64 be_packed[VPHN_REGISTER_COUNT]; + int i, nr_assoc_doms = 0; + const __be16 *field = (const __be16 *) be_packed; + u16 last = 0; + bool is_32bit = false; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + /* Let's fix the values returned by plpar_hcall9() */ + for (i = 0; i < VPHN_REGISTER_COUNT; i++) + be_packed[i] = cpu_to_be64(packed[i]); + + for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { + u16 new = be16_to_cpup(field++); + + if (is_32bit) { + /* + * Let's concatenate the 16 bits of this field to the + * 15 lower bits of the previous field + */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(last << 16 | new); + is_32bit = false; + } else if (new == VPHN_FIELD_UNUSED) + /* This is the list terminator */ + break; + else if (new & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(new & VPHN_FIELD_MASK); + } else { + /* + * Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + last = new; + is_32bit = true; + } + } + + /* The first cell contains the length of the property */ + unpacked[0] = cpu_to_be32(nr_assoc_doms); + + return nr_assoc_doms; +} + +/* NOTE: This file is included by a selftest and built in userspace. */ +#ifdef __KERNEL__ +#include + +long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity) +{ + long rc; + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + + rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, cpu); + vphn_unpack_associativity(retbuf, associativity); + + return rc; +} +#endif diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index 18b885da01bd..cf65cbf33085 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := test-vphn -CFLAGS += -m64 +CFLAGS += -m64 -I$(CURDIR) top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/vphn/asm/lppaca.h b/tools/testing/selftests/powerpc/vphn/asm/lppaca.h new file mode 120000 index 000000000000..942b1d00999c --- /dev/null +++ b/tools/testing/selftests/powerpc/vphn/asm/lppaca.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/asm/lppaca.h \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/vphn/vphn.c b/tools/testing/selftests/powerpc/vphn/vphn.c index 1d1f5f2be3b2..5b5fbddccabd 120000 --- a/tools/testing/selftests/powerpc/vphn/vphn.c +++ b/tools/testing/selftests/powerpc/vphn/vphn.c @@ -1 +1 @@ -../../../../../arch/powerpc/mm/book3s64/vphn.c \ No newline at end of file +../../../../../arch/powerpc/platforms/pseries/vphn.c \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/vphn/vphn.h b/tools/testing/selftests/powerpc/vphn/vphn.h deleted file mode 120000 index 45fe160f8288..000000000000 --- a/tools/testing/selftests/powerpc/vphn/vphn.h +++ /dev/null @@ -1 +0,0 @@ -../../../../../arch/powerpc/mm/book3s64/vphn.h \ No newline at end of file -- cgit v1.2.3 From 87b512def792579641499d9bef1d640994ea9c18 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 27 Jun 2019 20:50:46 -0500 Subject: objtool: Add support for C jump tables Objtool doesn't know how to read C jump tables, so it has to whitelist functions which use them, causing missing ORC unwinder data for such functions, e.g. ___bpf_prog_run(). C jump tables are very similar to GCC switch jump tables, which objtool already knows how to read. So adding support for C jump tables is easy. It just needs to be able to find the tables and distinguish them from other data. To allow the jump tables to be found, create an __annotate_jump_table macro which can be used to annotate them. The annotation is done by placing the jump table in an .rodata..c_jump_table section. The '.rodata' prefix ensures that the data will be placed in the rodata section by the vmlinux linker script. The double periods are part of an existing convention which distinguishes kernel sections from GCC sections. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Song Liu Cc: Kairui Song Cc: Steven Rostedt Cc: Borislav Petkov Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lkml.kernel.org/r/0ba2ca30442b16b97165992381ce643dc27b3d1a.1561685471.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 5 +++++ tools/objtool/check.c | 27 ++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8aaf7cd026b0..f0fd5636fddb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -116,9 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, ".pushsection .discard.unreachable\n\t" \ ".long 999b - .\n\t" \ ".popsection\n\t" + +/* Annotate a C jump table to allow objtool to follow the code flow */ +#define __annotate_jump_table __section(".rodata..c_jump_table") + #else #define annotate_reachable() #define annotate_unreachable() +#define __annotate_jump_table #endif #ifndef ASM_UNREACHABLE diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 172f99195726..27818a93f0b1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -18,6 +18,8 @@ #define FAKE_JUMP_OFFSET -1 +#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" + struct alternative { struct list_head list; struct instruction *insn; @@ -1035,9 +1037,15 @@ static struct rela *find_switch_table(struct objtool_file *file, /* * Make sure the .rodata address isn't associated with a - * symbol. gcc jump tables are anonymous data. + * symbol. GCC jump tables are anonymous data. + * + * Also support C jump tables which are in the same format as + * switch jump tables. For objtool to recognize them, they + * need to be placed in the C_JUMP_TABLE_SECTION section. They + * have symbols associated with them. */ - if (find_symbol_containing(rodata_sec, table_offset)) + if (find_symbol_containing(rodata_sec, table_offset) && + strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION)) continue; rodata_rela = find_rela_by_dest(rodata_sec, table_offset); @@ -1277,13 +1285,18 @@ static void mark_rodata(struct objtool_file *file) bool found = false; /* - * This searches for the .rodata section or multiple .rodata.func_name - * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8 - * rodata sections are ignored as they don't contain jump tables. + * Search for the following rodata sections, each of which can + * potentially contain jump tables: + * + * - .rodata: can contain GCC switch tables + * - .rodata.: same, if -fdata-sections is being used + * - .rodata..c_jump_table: contains C annotated jump tables + * + * .rodata.str1.* sections are ignored; they don't contain jump tables. */ for_each_sec(file, sec) { - if (!strncmp(sec->name, ".rodata", 7) && - !strstr(sec->name, ".str1.")) { + if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) || + !strcmp(sec->name, C_JUMP_TABLE_SECTION)) { sec->rodata = true; found = true; } -- cgit v1.2.3 From 4e4cf62b37da5ff45c904a3acf242ab29ed5881d Mon Sep 17 00:00:00 2001 From: Numfor Mbiziwo-Tiapo Date: Tue, 2 Jul 2019 10:37:15 -0700 Subject: perf test mmap-thread-lookup: Initialize variable to suppress memory sanitizer warning Running the 'perf test' command after building perf with a memory sanitizer causes a warning that says: WARNING: MemorySanitizer: use-of-uninitialized-value... in mmap-thread-lookup.c Initializing the go variable to 0 silences this harmless warning. Committer warning: This was harmless, just a simple test writing whatever was at that sizeof(int) memory area just to signal another thread blocked reading that file created with pipe(). Initialize it tho so that we don't get this warning. Signed-off-by: Numfor Mbiziwo-Tiapo Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Drayton Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190702173716.181223-1-nums@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/mmap-thread-lookup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index ba87e6e8d18c..0a4301a5155c 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -53,7 +53,7 @@ static void *thread_fn(void *arg) { struct thread_data *td = arg; ssize_t ret; - int go; + int go = 0; if (thread_init(td)) return NULL; -- cgit v1.2.3 From c74b05030edb3b52f4208d8415b8c933bc509a29 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jul 2019 18:34:11 +0800 Subject: perf stat: Fix use-after-freed pointer detected by the smatch tool Based on the following report from Smatch, fix the use-after-freed pointer. tools/perf/builtin-stat.c:1353 add_default_attributes() warn: passing freed memory 'str'. The pointer 'str' has been freed but later it is still passed into the function parse_events_print_error(). This patch fixes this use-after-freed issue. Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Alexios Zavras Cc: Andi Kleen Cc: Changbin Du Cc: Davidlohr Bueso Cc: David S. Miller Cc: Eric Saint-Etienne Cc: Jin Yao Cc: Konstantin Khlebnikov Cc: linux-arm-kernel@lists.infradead.org Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Song Liu Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Thomas Richter Link: http://lkml.kernel.org/r/20190702103420.27540-3-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e5e19b461061..b81f7b197d24 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1349,8 +1349,8 @@ static int add_default_attributes(void) fprintf(stderr, "Cannot set up top down events %s: %d\n", str, err); - free(str); parse_events_print_error(&errinfo, str); + free(str); return -1; } } else { -- cgit v1.2.3 From 111442cfc8abdeaa7ec1407f07ef7b3e5f76654e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jul 2019 18:34:12 +0800 Subject: perf top: Fix potential NULL pointer dereference detected by the smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/builtin-top.c:109 perf_top__parse_source() warn: variable dereferenced before check 'he' (see line 103) tools/perf/builtin-top.c:233 perf_top__show_details() warn: variable dereferenced before check 'he' (see line 228) tools/perf/builtin-top.c 101 static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) 102 { 103 struct perf_evsel *evsel = hists_to_evsel(he->hists); ^^^^ 104 struct symbol *sym; 105 struct annotation *notes; 106 struct map *map; 107 int err = -1; 108 109 if (!he || !he->ms.sym) 110 return -1; This patch moves the values assignment after validating pointer 'he'. Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Alexios Zavras Cc: Andi Kleen Cc: Changbin Du Cc: David S. Miller Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: Jin Yao Cc: Konstantin Khlebnikov Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Song Liu Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Thomas Richter Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190702103420.27540-4-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 6d40a4ef58c5..b46b3c9f57a0 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -101,7 +101,7 @@ static void perf_top__resize(struct perf_top *top) static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) { - struct perf_evsel *evsel = hists_to_evsel(he->hists); + struct perf_evsel *evsel; struct symbol *sym; struct annotation *notes; struct map *map; @@ -110,6 +110,8 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) if (!he || !he->ms.sym) return -1; + evsel = hists_to_evsel(he->hists); + sym = he->ms.sym; map = he->ms.map; @@ -226,7 +228,7 @@ static void perf_top__record_precise_ip(struct perf_top *top, static void perf_top__show_details(struct perf_top *top) { struct hist_entry *he = top->sym_filter_entry; - struct perf_evsel *evsel = hists_to_evsel(he->hists); + struct perf_evsel *evsel; struct annotation *notes; struct symbol *symbol; int more; @@ -234,6 +236,8 @@ static void perf_top__show_details(struct perf_top *top) if (!he) return; + evsel = hists_to_evsel(he->hists); + symbol = he->ms.sym; notes = symbol__annotation(symbol); -- cgit v1.2.3 From 600c787dbf6521d8d07ee717ab7606d5070103ea Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jul 2019 18:34:13 +0800 Subject: perf annotate: Fix dereferencing freed memory found by the smatch tool Based on the following report from Smatch, fix the potential dereferencing freed memory check. tools/perf/util/annotate.c:1125 disasm_line__parse() error: dereferencing freed memory 'namep' tools/perf/util/annotate.c 1100 static int disasm_line__parse(char *line, const char **namep, char **rawp) 1101 { 1102 char tmp, *name = ltrim(line); [...] 1114 *namep = strdup(name); 1115 1116 if (*namep == NULL) 1117 goto out_free_name; [...] 1124 out_free_name: 1125 free((void *)namep); ^^^^^ 1126 *namep = NULL; ^^^^^^ 1127 return -1; 1128 } If strdup() fails to allocate memory space for *namep, we don't need to free memory with pointer 'namep', which is resident in data structure disasm_line::ins::name; and *namep is NULL pointer for this failure, so it's pointless to assign NULL to *namep again. Committer note: Freeing namep, which is the address of the first entry of the 'struct ins' that is the first member of struct disasm_line would in fact free that disasm_line instance, if it was allocated via malloc/calloc, which, later, would a dereference of freed memory. Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Alexios Zavras Cc: Andi Kleen Cc: Changbin Du Cc: David S. Miller Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: Jin Yao Cc: Konstantin Khlebnikov Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Song Liu Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Thomas Richter Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190702103420.27540-5-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ec7aaf31c2b2..944a6507a5e3 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1119,16 +1119,14 @@ static int disasm_line__parse(char *line, const char **namep, char **rawp) *namep = strdup(name); if (*namep == NULL) - goto out_free_name; + goto out; (*rawp)[0] = tmp; *rawp = skip_spaces(*rawp); return 0; -out_free_name: - free((void *)namep); - *namep = NULL; +out: return -1; } -- cgit v1.2.3 From 7a6d49dc8cad8fa1f3d63994102af8f9ae9c859f Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jul 2019 18:34:14 +0800 Subject: perf trace: Fix potential NULL pointer dereference found by the smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/builtin-trace.c:1044 thread_trace__new() error: we previously assumed 'ttrace' could be null (see line 1041). tools/perf/builtin-trace.c 1037 static struct thread_trace *thread_trace__new(void) 1038 { 1039 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); 1040 1041 if (ttrace) 1042 ttrace->files.max = -1; 1043 1044 ttrace->syscall_stats = intlist__new(NULL); ^^^^^^^^ 1045 1046 return ttrace; 1047 } Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Alexios Zavras Cc: Andi Kleen Cc: Changbin Du Cc: David S. Miller Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: Jin Yao Cc: Konstantin Khlebnikov Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Song Liu Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Thomas Richter Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190702103420.27540-6-leo.yan@linaro.org [ Just made it look like other tools/perf constructors, same end result ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d0eb7224dd36..e3fc9062f136 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1038,10 +1038,10 @@ static struct thread_trace *thread_trace__new(void) { struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); - if (ttrace) + if (ttrace) { ttrace->files.max = -1; - - ttrace->syscall_stats = intlist__new(NULL); + ttrace->syscall_stats = intlist__new(NULL); + } return ttrace; } -- cgit v1.2.3 From 363bbaef63ffebcc745239fe80a953ebb5ac9ec9 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jul 2019 18:34:16 +0800 Subject: perf map: Fix potential NULL pointer dereference found by smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/util/map.c:479 map__fprintf_srccode() error: we previously assumed 'state' could be null (see line 466) tools/perf/util/map.c 465 /* Avoid redundant printing */ 466 if (state && 467 state->srcfile && 468 !strcmp(state->srcfile, srcfile) && 469 state->line == line) { 470 free(srcfile); 471 return 0; 472 } 473 474 srccode = find_sourceline(srcfile, line, &len); 475 if (!srccode) 476 goto out_free_line; 477 478 ret = fprintf(fp, "|%-8d %.*s", line, len, srccode); 479 state->srcfile = srcfile; ^^^^^^^ 480 state->line = line; ^^^^^^^ This patch validates 'state' pointer before access its elements. Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Alexios Zavras Cc: Andi Kleen Cc: Changbin Du Cc: David S. Miller Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: Jin Yao Cc: Konstantin Khlebnikov Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Song Liu Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Thomas Richter Cc: linux-arm-kernel@lists.infradead.org Fixes: dd2e18e9ac20 ("perf tools: Support 'srccode' output") Link: http://lkml.kernel.org/r/20190702103420.27540-8-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 6fce983c6115..5f87975d2562 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -476,8 +476,11 @@ int map__fprintf_srccode(struct map *map, u64 addr, goto out_free_line; ret = fprintf(fp, "|%-8d %.*s", line, len, srccode); - state->srcfile = srcfile; - state->line = line; + + if (state) { + state->srcfile = srcfile; + state->line = line; + } return ret; out_free_line: -- cgit v1.2.3 From 40978e9bf2137223993e70921de2731201788049 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 3 Jul 2019 16:02:09 -0300 Subject: perf inject: The tool->read() call may pass a NULL evsel, handle it Check first, as machines__deliver_event() may have perf_evlist__id2evsel() returning NULL. This was found while checking a report from Leo Yan that used the smatch tool to find places where a pointer is checked before use and then, later in the same function gets used without checking. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-muvb8xqyh0gysgfjfq35w642@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8e0e06d3edfc..f4591a1438b4 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -224,7 +224,7 @@ static int perf_event__repipe_sample(struct perf_tool *tool, struct perf_evsel *evsel, struct machine *machine) { - if (evsel->handler) { + if (evsel && evsel->handler) { inject_handler f = evsel->handler; return f(tool, event, sample, evsel, machine); } -- cgit v1.2.3 From f3c8d90757724982e5f07cd77d315eb64ca145ac Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jul 2019 18:34:17 +0800 Subject: perf session: Fix potential NULL pointer dereference found by the smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/util/session.c:1252 dump_read() error: we previously assumed 'evsel' could be null (see line 1249) tools/perf/util/session.c 1240 static void dump_read(struct perf_evsel *evsel, union perf_event *event) 1241 { 1242 struct read_event *read_event = &event->read; 1243 u64 read_format; 1244 1245 if (!dump_trace) 1246 return; 1247 1248 printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid, 1249 evsel ? perf_evsel__name(evsel) : "FAIL", 1250 event->read.value); 1251 1252 read_format = evsel->attr.read_format; ^^^^^^^ 'evsel' could be NULL pointer, for this case this patch directly bails out without dumping read_event. Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Alexios Zavras Cc: Andi Kleen Cc: Changbin Du Cc: David S. Miller Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: Jin Yao Cc: Konstantin Khlebnikov Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Song Liu Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Thomas Richter Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190702103420.27540-9-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 54cf163347f7..2e61dd6a3574 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1249,6 +1249,9 @@ static void dump_read(struct perf_evsel *evsel, union perf_event *event) evsel ? perf_evsel__name(evsel) : "FAIL", event->read.value); + if (!evsel) + return; + read_format = evsel->attr.read_format; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) -- cgit v1.2.3 From fc50e0ba9bcac92ff177ff3ac64644108b6d8dd8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 3 Jul 2019 16:12:51 -0300 Subject: perf evsel: perf_evsel__name(NULL) is valid, no need to check evsel It'll return "unknown", no need to open code it. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-4okvjmm18arjrcyfhuahgfxm@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- tools/perf/util/session.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index aef59f318a67..93d4b12e248e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -298,7 +298,7 @@ static int process_read_event(struct perf_tool *tool, struct report *rep = container_of(tool, struct report, tool); if (rep->show_threads) { - const char *name = evsel ? perf_evsel__name(evsel) : "unknown"; + const char *name = perf_evsel__name(evsel); int err = perf_read_values_add_value(&rep->show_threads_values, event->read.pid, event->read.tid, evsel->idx, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 2e61dd6a3574..e3463df18493 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1246,7 +1246,7 @@ static void dump_read(struct perf_evsel *evsel, union perf_event *event) return; printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid, - evsel ? perf_evsel__name(evsel) : "FAIL", + perf_evsel__name(evsel), event->read.value); if (!evsel) -- cgit v1.2.3 From 215a0d305c5651928eb67c96bcedd0a6c297dfce Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 4 Jul 2019 11:21:24 -0300 Subject: perf tools: Add missing headers, mostly stdlib.h Part of the erosion of util/util.h, that will lose its include stdlib.h, we need to add it to places where it is needed but was getting it indirectly. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-1imnqezw99ahc07fjeb51qby@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/common.c | 1 + tools/perf/arch/powerpc/util/perf_regs.c | 2 ++ tools/perf/arch/s390/util/header.c | 1 + tools/perf/builtin-config.c | 1 + tools/perf/builtin-help.c | 1 + tools/perf/tests/llvm.c | 1 + tools/perf/tests/sample-parsing.c | 1 + tools/perf/tests/vmlinux-kallsyms.c | 1 + tools/perf/ui/browser.h | 1 + tools/perf/ui/browsers/map.c | 1 + tools/perf/ui/tui/setup.c | 1 + tools/perf/ui/tui/util.c | 2 +- tools/perf/util/cputopo.c | 1 + tools/perf/util/db-export.c | 1 + tools/perf/util/debug.c | 1 + tools/perf/util/demangle-java.c | 3 ++- tools/perf/util/dwarf-aux.c | 2 +- tools/perf/util/env.c | 1 + tools/perf/util/parse-branch-options.c | 2 +- tools/perf/util/parse-regs-options.c | 8 ++++++-- tools/perf/util/strbuf.c | 1 + tools/perf/util/symbol-elf.c | 1 + tools/perf/util/symbol-minimal.c | 1 + tools/perf/util/target.c | 2 +- tools/perf/util/thread-stack.c | 1 + tools/perf/util/usage.c | 3 +++ 26 files changed, 35 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c index f3824ca7c20b..1bc329412bcf 100644 --- a/tools/perf/arch/common.c +++ b/tools/perf/arch/common.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include "common.h" #include "../util/env.h" #include "../util/util.h" diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c index 34d5134681d9..64f65c296d3e 100644 --- a/tools/perf/arch/powerpc/util/perf_regs.c +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -8,6 +8,8 @@ #include "../../util/perf_regs.h" #include "../../util/debug.h" +#include + const struct sample_reg sample_reg_masks[] = { SMPL_REG(r0, PERF_REG_POWERPC_R0), SMPL_REG(r1, PERF_REG_POWERPC_R1), diff --git a/tools/perf/arch/s390/util/header.c b/tools/perf/arch/s390/util/header.c index a25896135abe..165c51435e72 100644 --- a/tools/perf/arch/s390/util/header.c +++ b/tools/perf/arch/s390/util/header.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../../util/header.h" #include "../../util/util.h" diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index d76f831f94c7..6c1284c87aaa 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -15,6 +15,7 @@ #include "util/debug.h" #include "util/config.h" #include +#include static bool use_system_config, use_user_config; diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 3d29d0524a89..6a1cab547043 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index a039f93199e5..ca5a5f94ce79 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 236ce0d6c826..361714e2583c 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index f101576d1c72..5e8834fc7dec 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "map.h" #include "symbol.h" #include "util.h" diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h index aa5932e1d62e..dc1444136658 100644 --- a/tools/perf/ui/browser.h +++ b/tools/perf/ui/browser.h @@ -4,6 +4,7 @@ #include #include +#include #define HE_COLORSET_TOP 50 #define HE_COLORSET_MEDIUM 51 diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c index 5f6529c9eb8e..4c545b92e20d 100644 --- a/tools/perf/ui/browsers/map.c +++ b/tools/perf/ui/browsers/map.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include "../../util/util.h" diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c index d4ac41679721..3ad0d3363ac6 100644 --- a/tools/perf/ui/tui/setup.c +++ b/tools/perf/ui/tui/setup.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #ifdef HAVE_BACKTRACE_SUPPORT #include diff --git a/tools/perf/ui/tui/util.c b/tools/perf/ui/tui/util.c index b9794d6185af..fe5e571816fc 100644 --- a/tools/perf/ui/tui/util.c +++ b/tools/perf/ui/tui/util.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../../util/util.h" #include #include #include +#include #include #include "../../util/cache.h" diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index 26e73a4bd4fe..d3b2bd258b9e 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "cputopo.h" diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 2182f552aac6..4cdd1f579156 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -5,6 +5,7 @@ */ #include +#include #include "evsel.h" #include "machine.h" diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 3cc578343f48..3780fe42453b 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c index 5b4900d67c80..763328c151e9 100644 --- a/tools/perf/util/demangle-java.c +++ b/tools/perf/util/demangle-java.c @@ -1,14 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include -#include "util.h" #include "debug.h" #include "symbol.h" #include "demangle-java.h" #include +#include enum { MODE_PREFIX = 0, diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 218bfea8f8a8..03b2de1f5a35 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -6,7 +6,7 @@ #include #include #include -#include "util.h" +#include #include "debug.h" #include "dwarf-aux.h" #include "string2.h" diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 22eee8942527..7d317d49d207 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -7,6 +7,7 @@ #include #include #include +#include struct perf_env perf_env; diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c index bd779d9f4d1e..726e8d9e8c54 100644 --- a/tools/perf/util/parse-branch-options.c +++ b/tools/perf/util/parse-branch-options.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "perf.h" -#include "util/util.h" #include "util/debug.h" #include #include "util/parse-branch-options.h" +#include #define BRANCH_OPT(n, m) \ { .name = n, .mode = (m) } diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c index 08581e276225..ef46c2848808 100644 --- a/tools/perf/util/parse-regs-options.c +++ b/tools/perf/util/parse-regs-options.c @@ -1,8 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include "perf.h" -#include "util/util.h" +#include +#include +#include +#include +#include #include "util/debug.h" #include +#include "util/perf_regs.h" #include "util/parse-regs-options.h" static int diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c index 23092fd6451d..54336df089df 100644 --- a/tools/perf/util/strbuf.c +++ b/tools/perf/util/strbuf.c @@ -3,6 +3,7 @@ #include "util.h" #include #include +#include /* * Used as the default ->buf value, so that people can always assume diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 62008756d8cc..429920978cb0 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 17edbd4f6f85..c8b7cadbc9c4 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/tools/perf/util/target.c b/tools/perf/util/target.c index 3852d07c49bd..3adc65480349 100644 --- a/tools/perf/util/target.c +++ b/tools/perf/util/target.c @@ -10,9 +10,9 @@ #include "debug.h" #include +#include #include - enum target_errno target__validate(struct target *target) { enum target_errno ret = TARGET_ERRNO__SUCCESS; diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 6ff1ff4d4ce7..48d585a0175c 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "thread.h" #include "event.h" #include "machine.h" diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c index 070d25ceea6a..3949a60b00ae 100644 --- a/tools/perf/util/usage.c +++ b/tools/perf/util/usage.c @@ -9,6 +9,9 @@ */ #include "util.h" #include "debug.h" +#include +#include +#include static __noreturn void usage_builtin(const char *err) { -- cgit v1.2.3 From 245aec7f7f4ca95b924f005d604bab9d838b5eb1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 5 Jul 2019 13:59:06 -0300 Subject: perf namespaces: Move the conditional setns() prototype to namespaces.h Out of util.h, to reduce its scope, and since we have a namespaces.h header, much better to have it there, where it is related to. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-zlu81bbtccuzygh7m8nmgybc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/namespaces.h | 4 ++++ tools/perf/util/setns.c | 4 +++- tools/perf/util/util.h | 4 ---- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h index 15a5a276c478..004430c0de93 100644 --- a/tools/perf/util/namespaces.h +++ b/tools/perf/util/namespaces.h @@ -13,6 +13,10 @@ #include #include +#ifndef HAVE_SETNS_SUPPORT +int setns(int fd, int nstype); +#endif + struct namespaces_event; struct namespaces { diff --git a/tools/perf/util/setns.c b/tools/perf/util/setns.c index ce8fc290fce8..48f9c0af63b2 100644 --- a/tools/perf/util/setns.c +++ b/tools/perf/util/setns.c @@ -1,4 +1,6 @@ -#include "util.h" +// SPDX-License-Identifier: LGPL-2.1 + +#include "namespaces.h" #include #include diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 125e215dd3d8..59fe33708090 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -67,10 +67,6 @@ char *get_current_dir_name(void); int sched_getcpu(void); #endif -#ifndef HAVE_SETNS_SUPPORT -int setns(int fd, int nstype); -#endif - extern bool perf_singlethreaded; void perf_set_singlethreaded(void); -- cgit v1.2.3 From e5653eb82ddc71ad8ffcbb3c74dd6f0c0230ab4c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 5 Jul 2019 14:16:15 -0300 Subject: perf tools: Move get_current_dir_name() cond prototype out of util.h And in a separate header, so that we erode util.h a bit more. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-xpzvuu9d0gei9jl9bkzgobln@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/get_current_dir_name.c | 6 +++--- tools/perf/util/get_current_dir_name.h | 8 ++++++++ tools/perf/util/namespaces.c | 1 + tools/perf/util/util.h | 4 ---- 4 files changed, 12 insertions(+), 7 deletions(-) create mode 100644 tools/perf/util/get_current_dir_name.h (limited to 'tools') diff --git a/tools/perf/util/get_current_dir_name.c b/tools/perf/util/get_current_dir_name.c index 267aa609a582..01f32f26552d 100644 --- a/tools/perf/util/get_current_dir_name.c +++ b/tools/perf/util/get_current_dir_name.c @@ -1,8 +1,8 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo +// SPDX-License-Identifier: LGPL-2.1 +// Copyright (C) 2018, 2019 Red Hat Inc, Arnaldo Carvalho de Melo // #ifndef HAVE_GET_CURRENT_DIR_NAME -#include "util.h" +#include "get_current_dir_name.h" #include #include #include diff --git a/tools/perf/util/get_current_dir_name.h b/tools/perf/util/get_current_dir_name.h new file mode 100644 index 000000000000..69f7d5537d32 --- /dev/null +++ b/tools/perf/util/get_current_dir_name.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 +// Copyright (C) 2018, 2019 Red Hat Inc, Arnaldo Carvalho de Melo +// +#ifndef __PERF_GET_CURRENT_DIR_NAME_H +#ifndef HAVE_GET_CURRENT_DIR_NAME +char *get_current_dir_name(void); +#endif // HAVE_GET_CURRENT_DIR_NAME +#endif // __PERF_GET_CURRENT_DIR_NAME_H diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c index 023c4efd788d..fda2fa1e8819 100644 --- a/tools/perf/util/namespaces.c +++ b/tools/perf/util/namespaces.c @@ -7,6 +7,7 @@ #include "namespaces.h" #include "util.h" #include "event.h" +#include "get_current_dir_name.h" #include #include #include diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 59fe33708090..cfc4d85bbd42 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -59,10 +59,6 @@ int fetch_kernel_version(unsigned int *puint, const char *perf_tip(const char *dirpath); -#ifndef HAVE_GET_CURRENT_DIR_NAME -char *get_current_dir_name(void); -#endif - #ifndef HAVE_SCHED_GETCPU_SUPPORT int sched_getcpu(void); #endif -- cgit v1.2.3 From 7f7c536f23e6afaa5d5d4b0e0958b0be8922491f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 4 Jul 2019 11:32:27 -0300 Subject: tools lib: Adopt zalloc()/zfree() from tools/perf Eroding a bit more the tools/perf/util/util.h hodpodge header. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-natazosyn9rwjka25tvcnyi0@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/zalloc.h | 12 ++++++++++++ tools/lib/zalloc.c | 15 +++++++++++++++ tools/perf/MANIFEST | 1 + tools/perf/arch/arm/annotate/instructions.c | 1 + tools/perf/arch/arm/util/auxtrace.c | 1 + tools/perf/arch/arm/util/cs-etm.c | 1 + tools/perf/arch/arm64/util/arm-spe.c | 1 + tools/perf/arch/common.c | 2 +- tools/perf/arch/powerpc/util/perf_regs.c | 2 +- tools/perf/arch/s390/util/auxtrace.c | 1 + tools/perf/arch/s390/util/header.c | 2 +- tools/perf/arch/x86/util/event.c | 2 +- tools/perf/arch/x86/util/intel-bts.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 2 +- tools/perf/arch/x86/util/perf_regs.c | 2 +- tools/perf/bench/mem-functions.c | 2 +- tools/perf/bench/numa.c | 2 +- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-bench.c | 2 +- tools/perf/builtin-c2c.c | 2 +- tools/perf/builtin-diff.c | 2 +- tools/perf/builtin-help.c | 1 + tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-lock.c | 2 +- tools/perf/builtin-probe.c | 2 +- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-sched.c | 2 +- tools/perf/builtin-script.c | 2 +- tools/perf/builtin-stat.c | 2 +- tools/perf/builtin-timechart.c | 4 +--- tools/perf/builtin-trace.c | 1 + tools/perf/perf.c | 2 +- tools/perf/tests/switch-tracking.c | 1 + tools/perf/ui/browser.c | 2 +- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/ui/browsers/hists.c | 2 +- tools/perf/ui/gtk/util.c | 3 +-- tools/perf/ui/stdio/hist.c | 2 +- tools/perf/util/Build | 5 +++++ tools/perf/util/arm-spe.c | 2 +- tools/perf/util/auxtrace.c | 2 +- tools/perf/util/bpf-loader.c | 1 + tools/perf/util/build-id.c | 1 + tools/perf/util/call-path.c | 3 ++- tools/perf/util/callchain.c | 2 +- tools/perf/util/cgroup.c | 2 +- tools/perf/util/comm.c | 2 +- tools/perf/util/config.c | 3 +-- tools/perf/util/counts.c | 2 +- tools/perf/util/cpumap.c | 2 +- tools/perf/util/cputopo.c | 2 +- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 1 + tools/perf/util/cs-etm.c | 1 + tools/perf/util/data-convert-bt.c | 2 +- tools/perf/util/data.c | 1 + tools/perf/util/db-export.c | 2 +- tools/perf/util/dso.c | 3 ++- tools/perf/util/env.c | 2 +- tools/perf/util/event.c | 1 + tools/perf/util/evlist.c | 2 +- tools/perf/util/evsel.c | 2 +- tools/perf/util/header.c | 2 +- tools/perf/util/help-unknown-cmd.c | 2 ++ tools/perf/util/hist.c | 2 +- tools/perf/util/intel-bts.c | 2 +- tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 2 +- tools/perf/util/intel-pt.c | 2 +- tools/perf/util/llvm-utils.c | 1 + tools/perf/util/machine.c | 2 +- tools/perf/util/map.c | 2 +- tools/perf/util/mem2node.c | 2 +- tools/perf/util/mmap.c | 1 + tools/perf/util/namespaces.c | 2 +- tools/perf/util/parse-events.c | 1 + tools/perf/util/pmu.c | 2 +- tools/perf/util/probe-event.c | 2 +- tools/perf/util/probe-file.c | 2 +- tools/perf/util/probe-finder.c | 2 +- tools/perf/util/pstack.c | 2 +- tools/perf/util/python-ext-sources | 1 + tools/perf/util/s390-cpumsf.c | 2 +- tools/perf/util/session.c | 2 +- tools/perf/util/srcline.c | 2 +- tools/perf/util/stat.c | 1 + tools/perf/util/strbuf.c | 2 +- tools/perf/util/strfilter.c | 3 ++- tools/perf/util/strlist.c | 2 +- tools/perf/util/svghelper.c | 2 +- tools/perf/util/symbol-elf.c | 1 + tools/perf/util/symbol-minimal.c | 2 +- tools/perf/util/symbol.c | 1 + tools/perf/util/syscalltbl.c | 2 +- tools/perf/util/thread-stack.c | 2 +- tools/perf/util/thread.c | 2 +- tools/perf/util/thread_map.c | 2 +- tools/perf/util/trace-event-info.c | 1 + tools/perf/util/trace-event-scripting.c | 2 +- tools/perf/util/unwind-libdw.c | 1 + tools/perf/util/util.h | 9 --------- tools/perf/util/values.c | 2 +- tools/perf/util/vdso.c | 1 + tools/perf/util/xyarray.c | 2 +- 103 files changed, 135 insertions(+), 86 deletions(-) create mode 100644 tools/include/linux/zalloc.h create mode 100644 tools/lib/zalloc.c (limited to 'tools') diff --git a/tools/include/linux/zalloc.h b/tools/include/linux/zalloc.h new file mode 100644 index 000000000000..81099c84043f --- /dev/null +++ b/tools/include/linux/zalloc.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: LGPL-2.1 +#ifndef __TOOLS_LINUX_ZALLOC_H +#define __TOOLS_LINUX_ZALLOC_H + +#include + +void *zalloc(size_t size); +void __zfree(void **ptr); + +#define zfree(ptr) __zfree((void **)(ptr)) + +#endif // __TOOLS_LINUX_ZALLOC_H diff --git a/tools/lib/zalloc.c b/tools/lib/zalloc.c new file mode 100644 index 000000000000..9c856d59f56e --- /dev/null +++ b/tools/lib/zalloc.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: LGPL-2.1 + +#include +#include + +void *zalloc(size_t size) +{ + return calloc(1, size); +} + +void __zfree(void **ptr) +{ + free(*ptr); + *ptr = NULL; +} diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 6a5de44b2de9..70f1ff4e2eb4 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -18,3 +18,4 @@ tools/lib/find_bit.c tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c +tools/lib/zalloc.c diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/arch/arm/annotate/instructions.c index f64516d5b23e..c7d1a69b894f 100644 --- a/tools/perf/arch/arm/annotate/instructions.c +++ b/tools/perf/arch/arm/annotate/instructions.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index 1ce6bdbda561..02014740a1aa 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -6,6 +6,7 @@ #include #include +#include #include "../../util/auxtrace.h" #include "../../util/evlist.h" diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 2b83cc8e4796..4208974c24f8 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "cs-etm.h" #include "../../perf.h" diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 5ccfce87e693..2c009aa74633 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "../../util/cpumap.h" diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c index 1bc329412bcf..1a9e22f78c22 100644 --- a/tools/perf/arch/common.c +++ b/tools/perf/arch/common.c @@ -3,8 +3,8 @@ #include #include "common.h" #include "../util/env.h" -#include "../util/util.h" #include "../util/debug.h" +#include const char *const arc_triplets[] = { "arc-linux-", diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c index 64f65c296d3e..f14102b85509 100644 --- a/tools/perf/arch/powerpc/util/perf_regs.c +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -2,9 +2,9 @@ #include #include #include +#include #include "../../perf.h" -#include "../../util/util.h" #include "../../util/perf_regs.h" #include "../../util/debug.h" diff --git a/tools/perf/arch/s390/util/auxtrace.c b/tools/perf/arch/s390/util/auxtrace.c index 44c857388897..0fe1be93f375 100644 --- a/tools/perf/arch/s390/util/auxtrace.c +++ b/tools/perf/arch/s390/util/auxtrace.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "../../util/evlist.h" #include "../../util/auxtrace.h" diff --git a/tools/perf/arch/s390/util/header.c b/tools/perf/arch/s390/util/header.c index 165c51435e72..8b0b018d896a 100644 --- a/tools/perf/arch/s390/util/header.c +++ b/tools/perf/arch/s390/util/header.c @@ -13,9 +13,9 @@ #include #include #include +#include #include "../../util/header.h" -#include "../../util/util.h" #define SYSINFO_MANU "Manufacturer:" #define SYSINFO_TYPE "Type:" diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c index 675a0213044d..a3a0b6884779 100644 --- a/tools/perf/arch/x86/util/event.c +++ b/tools/perf/arch/x86/util/event.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include "../../util/machine.h" #include "../../util/tool.h" #include "../../util/map.h" -#include "../../util/util.h" #include "../../util/debug.h" #if defined(__x86_64__) diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index e6d4d9591c79..ec5c1bb84095 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -9,12 +9,12 @@ #include #include #include +#include #include "../../util/cpumap.h" #include "../../util/evsel.h" #include "../../util/evlist.h" #include "../../util/session.h" -#include "../../util/util.h" #include "../../util/pmu.h" #include "../../util/debug.h" #include "../../util/tsc.h" diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 9804098dcefb..609088c01e3a 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "../../perf.h" @@ -25,7 +26,6 @@ #include "../../util/auxtrace.h" #include "../../util/tsc.h" #include "../../util/intel-pt.h" -#include "../../util/util.h" #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index 3666c0076df9..0d7b77ff0ae6 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -2,9 +2,9 @@ #include #include #include +#include #include "../../perf.h" -#include "../../util/util.h" #include "../../util/perf_regs.h" #include "../../util/debug.h" diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 0251dd348124..64dc994c72ea 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -9,7 +9,6 @@ #include "debug.h" #include "../perf.h" -#include "../util/util.h" #include #include "../util/header.h" #include "../util/cloexec.h" @@ -24,6 +23,7 @@ #include #include #include +#include #define K 1024 diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index a7784554a80d..a640ca7aaada 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -11,7 +11,6 @@ #include "../perf.h" #include "../builtin.h" -#include "../util/util.h" #include #include "../util/cloexec.h" @@ -35,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 77deb3a40596..e0aa14faf2b5 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -8,11 +8,11 @@ */ #include "builtin.h" -#include "util/util.h" #include "util/color.h" #include #include "util/cache.h" #include +#include #include "util/symbol.h" #include "perf.h" diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 334c77ffc1d9..b8e7c38ef221 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -17,7 +17,6 @@ * epoll ... Event poll performance */ #include "perf.h" -#include "util/util.h" #include #include "builtin.h" #include "bench/bench.h" @@ -26,6 +25,7 @@ #include #include #include +#include typedef int (*bench_fn_t)(int argc, const char **argv); diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 9e6cc868bdb4..e3776f5c2e01 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -15,9 +15,9 @@ #include #include #include +#include #include #include -#include "util.h" #include "debug.h" #include "builtin.h" #include diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index f924b46910b5..f6f5dd15bea7 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -16,12 +16,12 @@ #include "util/tool.h" #include "util/sort.h" #include "util/symbol.h" -#include "util/util.h" #include "util/data.h" #include "util/config.h" #include "util/time-utils.h" #include "util/annotate.h" #include "util/map.h" +#include #include #include diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 6a1cab547043..a83af92fb0d1 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -14,6 +14,7 @@ #include #include "util/debug.h" #include +#include #include #include #include diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 9bd3829de76d..9e5e60898083 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -4,7 +4,6 @@ #include "util/evlist.h" #include "util/evsel.h" -#include "util/util.h" #include "util/config.h" #include "util/map.h" #include "util/symbol.h" @@ -26,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index dbb6f737a3e2..b33c83489120 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -5,7 +5,6 @@ #include "util/evsel.h" #include "util/evlist.h" #include "util/term.h" -#include "util/util.h" #include "util/cache.h" #include "util/symbol.h" #include "util/thread.h" @@ -32,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index b9810a8d350a..c0be44e65e9d 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -6,7 +6,6 @@ #include "util/evlist.h" #include "util/evsel.h" -#include "util/util.h" #include "util/cache.h" #include "util/symbol.h" #include "util/thread.h" @@ -30,6 +29,7 @@ #include #include #include +#include static struct perf_session *session; diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 8bb124e55c6d..6418782951a4 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -19,7 +19,6 @@ #include "perf.h" #include "builtin.h" #include "namespaces.h" -#include "util/util.h" #include "util/strlist.h" #include "util/strfilter.h" #include "util/symbol.h" @@ -28,6 +27,7 @@ #include "util/probe-finder.h" #include "util/probe-event.h" #include "util/probe-file.h" +#include #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*" #define DEFAULT_FUNC_FILTER "!_*" diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 93d4b12e248e..abf0b9b8f566 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -8,7 +8,6 @@ */ #include "builtin.h" -#include "util/util.h" #include "util/config.h" #include "util/annotate.h" @@ -16,6 +15,7 @@ #include #include #include +#include #include "util/map.h" #include "util/symbol.h" #include "util/callchain.h" diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 1519989961ff..56d1907b1215 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2,7 +2,6 @@ #include "builtin.h" #include "perf.h" -#include "util/util.h" #include "util/evlist.h" #include "util/cache.h" #include "util/evsel.h" @@ -26,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 2f6232f1bfdc..b3536820f9a8 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -14,7 +14,6 @@ #include "util/symbol.h" #include "util/thread.h" #include "util/trace-event.h" -#include "util/util.h" #include "util/evlist.h" #include "util/evsel.h" #include "util/sort.h" @@ -34,6 +33,7 @@ #include #include #include +#include #include #include "asm/bug.h" #include "util/mem-events.h" diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b81f7b197d24..c72f4a0831a8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -43,7 +43,6 @@ #include "perf.h" #include "builtin.h" #include "util/cgroup.h" -#include "util/util.h" #include #include "util/parse-events.h" #include "util/pmu.h" @@ -67,6 +66,7 @@ #include "asm/bug.h" #include +#include #include #include #include diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 145a19668114..4bde3fa245d1 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -13,9 +13,6 @@ #include #include "builtin.h" - -#include "util/util.h" - #include "util/color.h" #include #include "util/cache.h" @@ -24,6 +21,7 @@ #include #include #include +#include #include "util/symbol.h" #include "util/thread.h" #include "util/callchain.h" diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index e3fc9062f136..1aa2ed096f65 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 2123b3cc4dcf..97e2628ea5dd 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -18,7 +18,6 @@ #include "util/bpf-loader.h" #include "util/debug.h" #include "util/event.h" -#include "util/util.h" #include #include #include @@ -30,6 +29,7 @@ #include #include #include +#include const char perf_usage_string[] = "perf [--version] [--help] [OPTIONS] COMMAND [ARGS]"; diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 9b5be51e5e7b..744409dce65f 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "parse-events.h" #include "evlist.h" diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index 55ff05a46e0b..f80c51d53565 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../util.h" #include "../string2.h" #include "../config.h" #include "../../perf.h" @@ -17,6 +16,7 @@ #include "keysyms.h" #include "../color.h" #include +#include static int ui_browser__percent_color(struct ui_browser *browser, double percent, bool current) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index b0d089a95dac..e67880bf1efe 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../../util/util.h" #include "../browser.h" #include "../helpline.h" #include "../ui.h" @@ -15,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 33e67aa91347..85581cfb9112 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "../../util/callchain.h" #include "../../util/evsel.h" @@ -18,7 +19,6 @@ #include "../../util/symbol.h" #include "../../util/pstack.h" #include "../../util/sort.h" -#include "../../util/util.h" #include "../../util/top.h" #include "../../util/thread.h" #include "../../arch/common.h" diff --git a/tools/perf/ui/gtk/util.c b/tools/perf/ui/gtk/util.c index 7250d8101c8f..c28bdb7517ac 100644 --- a/tools/perf/ui/gtk/util.c +++ b/tools/perf/ui/gtk/util.c @@ -1,11 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "../util.h" -#include "../../util/util.h" #include "../../util/debug.h" #include "gtk.h" #include - +#include struct perf_gtk_context *pgctx; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 89393c79d870..ee7ea6deed21 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -3,7 +3,6 @@ #include #include "../../util/callchain.h" -#include "../../util/util.h" #include "../../util/hist.h" #include "../../util/map.h" #include "../../util/map_groups.h" @@ -14,6 +13,7 @@ #include "../../util/string2.h" #include "../../util/thread.h" #include +#include static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin) { diff --git a/tools/perf/util/Build b/tools/perf/util/Build index d3408a463060..d7e3b008a613 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -25,6 +25,7 @@ perf-y += rbtree.o perf-y += libstring.o perf-y += bitmap.o perf-y += hweight.o +perf-y += zalloc.o perf-y += smt.o perf-y += strbuf.o perf-y += string.o @@ -241,3 +242,7 @@ $(OUTPUT)util/hweight.o: ../lib/hweight.c FORCE $(OUTPUT)util/vsprintf.o: ../lib/vsprintf.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) + +$(OUTPUT)util/zalloc.o: ../lib/zalloc.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 6067267cc76c..a314e5b26e9d 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "cpumap.h" #include "color.h" @@ -19,7 +20,6 @@ #include "evlist.h" #include "machine.h" #include "session.h" -#include "util.h" #include "thread.h" #include "debug.h" #include "auxtrace.h" diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index bc215fe0b4b4..0812a11a0dbe 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -24,9 +24,9 @@ #include #include #include +#include #include "../perf.h" -#include "util.h" #include "evlist.h" #include "dso.h" #include "map.h" diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 251d9ea6252f..93d0f239ad4f 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "perf.h" #include "debug.h" diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 89c6913dfc25..f1abfab7aa8c 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -30,6 +30,7 @@ #include "strlist.h" #include +#include static bool no_buildid_cache; diff --git a/tools/perf/util/call-path.c b/tools/perf/util/call-path.c index c5b90300304d..e8a80c41cba3 100644 --- a/tools/perf/util/call-path.c +++ b/tools/perf/util/call-path.c @@ -6,8 +6,9 @@ #include #include +#include +#include -#include "util.h" #include "call-path.h" static void call_path__init(struct call_path *cp, struct call_path *parent, diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index abb608b09269..b4af25dca5eb 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -16,11 +16,11 @@ #include #include #include +#include #include "asm/bug.h" #include "hist.h" -#include "util.h" #include "sort.h" #include "machine.h" #include "map.h" diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index ccd02634a616..f505d78f059b 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include "util.h" #include "../perf.h" #include #include "evsel.h" #include "cgroup.h" #include "evlist.h" #include +#include #include #include #include diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c index 1066de92af12..afb8d4fd2644 100644 --- a/tools/perf/util/comm.c +++ b/tools/perf/util/comm.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include "comm.h" -#include "util.h" #include #include #include #include #include #include +#include #include "rwsem.h" struct comm_str { diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 752cce853e51..042ffbc8c53f 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -11,7 +11,6 @@ */ #include #include -#include "util.h" #include "cache.h" #include "callchain.h" #include @@ -23,7 +22,7 @@ #include #include #include - +#include #include #define MAXNAME (256) diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c index 03032b410c29..88be9c4365e0 100644 --- a/tools/perf/util/counts.c +++ b/tools/perf/util/counts.c @@ -3,7 +3,7 @@ #include #include "evsel.h" #include "counts.h" -#include "util.h" +#include struct perf_counts *perf_counts__new(int ncpus, int nthreads) { diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 0d8fbedf7bd5..3acfbe34ebaf 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "util.h" #include #include "../perf.h" #include "cpumap.h" @@ -11,6 +10,7 @@ #include "asm/bug.h" #include +#include static int max_cpu_num; static int max_present_cpu_num; diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index d3b2bd258b9e..fa1778aee5d6 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -4,10 +4,10 @@ #include #include #include +#include #include "cputopo.h" #include "cpumap.h" -#include "util.h" #include "env.h" #define CORE_SIB_FMT \ diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index bb45e23018ee..37d7c492b155 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 0c7776b51045..d92516edbead 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 7b06e7373b9e..1e93f2e94c40 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include "asm/bug.h" #include "data-convert-bt.h" #include "session.h" -#include "util.h" #include "debug.h" #include "tool.h" #include "evlist.h" diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 6a64f713710d..df7e000e19ea 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 4cdd1f579156..3f2694ccfac7 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -14,11 +14,11 @@ #include "symbol.h" #include "map.h" #include "event.h" -#include "util.h" #include "thread-stack.h" #include "callchain.h" #include "call-path.h" #include "db-export.h" +#include struct deferred_export { struct list_head node; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index c7fde04400f7..ebacf07fc9ee 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -21,7 +22,7 @@ #include "dso.h" #include "machine.h" #include "auxtrace.h" -#include "util.h" +#include "util.h" /* O_CLOEXEC for older systems */ #include "debug.h" #include "string2.h" #include "vdso.h" diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 7d317d49d207..f92d992bd2db 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -2,7 +2,7 @@ #include "cpumap.h" #include "env.h" #include -#include "util.h" +#include #include "bpf-event.h" #include #include diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index e1d0c5ba1f92..7524bda5140b 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -11,6 +11,7 @@ #include /* To get things like MAP_HUGETLB even on older libc headers */ #include #include +#include #include "event.h" #include "debug.h" #include "hist.h" diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a474ede17cd6..b0364d923f76 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -5,7 +5,6 @@ * Parts came from builtin-{top,stat,record}.c, see those files for further * copyright notes. */ -#include "util.h" #include #include #include @@ -33,6 +32,7 @@ #include #include #include +#include #ifdef LACKS_SIGQUEUE_PROTOTYPE int sigqueue(pid_t pid, int sig, const union sigval value); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 7fb4ae82f34c..7ede674edf07 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include "event.h" #include "evsel.h" #include "evlist.h" -#include "util.h" #include "cpumap.h" #include "thread_map.h" #include "target.h" diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 6a93ff5d8db5..4e2efaa50c2f 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include "util.h" #include "string2.h" #include #include @@ -15,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index 4f07a5ba5030..ab9e16123626 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -3,9 +3,11 @@ #include "config.h" #include #include +#include #include #include "../builtin.h" #include "levenshtein.h" +#include static int autocorrect; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 27cecb59f866..bb1d77331add 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include "callchain.h" -#include "util.h" #include "build-id.h" #include "hist.h" #include "map.h" @@ -20,6 +19,7 @@ #include #include #include +#include static bool hists__filter_entry_by_dso(struct hists *hists, struct hist_entry *he); diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index e32dbffebb2f..5a21bcdb8ef7 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "cpumap.h" #include "color.h" @@ -21,7 +22,6 @@ #include "map.h" #include "symbol.h" #include "session.h" -#include "util.h" #include "thread.h" #include "thread-stack.h" #include "debug.h" diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 4d14e78c5927..3bfdf2b7a96a 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -14,9 +14,9 @@ #include #include #include +#include #include "../cache.h" -#include "../util.h" #include "../auxtrace.h" #include "intel-pt-insn-decoder.h" diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 470aaae9d930..c76a96f777fb 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "../perf.h" #include "session.h" @@ -22,7 +23,6 @@ #include "evsel.h" #include "map.h" #include "color.h" -#include "util.h" #include "thread.h" #include "thread-stack.h" #include "symbol.h" diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 5b0b60f00275..b9fddb809d58 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "debug.h" #include "llvm-utils.h" #include "config.h" diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 147ed85ea2bc..f523da3009e4 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -15,7 +15,6 @@ #include "strlist.h" #include "thread.h" #include "vdso.h" -#include "util.h" #include #include #include @@ -28,6 +27,7 @@ #include #include #include +#include static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 5f87975d2562..668410b1d426 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -12,10 +12,10 @@ #include "thread.h" #include "vdso.h" #include "build-id.h" -#include "util.h" #include "debug.h" #include "machine.h" #include +#include #include "srcline.h" #include "namespaces.h" #include "unwind.h" diff --git a/tools/perf/util/mem2node.c b/tools/perf/util/mem2node.c index c6fd81c02586..cacc2fc4dcbd 100644 --- a/tools/perf/util/mem2node.c +++ b/tools/perf/util/mem2node.c @@ -1,8 +1,8 @@ #include #include #include +#include #include "mem2node.h" -#include "util.h" struct phys_entry { struct rb_node rb_node; diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 768c632b0d82..9f0b6391af33 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef HAVE_LIBNUMA_SUPPORT #include #endif diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c index fda2fa1e8819..46d3a7754897 100644 --- a/tools/perf/util/namespaces.c +++ b/tools/perf/util/namespaces.c @@ -5,7 +5,6 @@ */ #include "namespaces.h" -#include "util.h" #include "event.h" #include "get_current_dir_name.h" #include @@ -18,6 +17,7 @@ #include #include #include +#include struct namespaces *namespaces__new(struct namespaces_event *event) { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index cf0b9b81c5aa..aa439853f20a 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 55f4de6442e3..12b677902fbc 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -14,7 +15,6 @@ #include #include #include -#include "util.h" #include "pmu.h" #include "parse-events.h" #include "cpumap.h" diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 6f24eaf6e504..80c0eca0f1ee 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -19,7 +19,6 @@ #include #include -#include "util.h" #include "event.h" #include "namespaces.h" #include "strlist.h" @@ -40,6 +39,7 @@ #include "string2.h" #include +#include #define PERFPROBE_GROUP "probe" diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index 0ed1900454eb..c2998f90b23c 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -10,8 +10,8 @@ #include #include #include +#include #include "namespaces.h" -#include "util.h" #include "event.h" #include "strlist.h" #include "strfilter.h" diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 6b40cc691a2d..7d8c99734928 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -19,11 +19,11 @@ #include #include +#include #include "event.h" #include "dso.h" #include "debug.h" #include "intlist.h" -#include "util.h" #include "strlist.h" #include "symbol.h" #include "probe-finder.h" diff --git a/tools/perf/util/pstack.c b/tools/perf/util/pstack.c index 797fe1ae2d2e..28de8a4c2ce8 100644 --- a/tools/perf/util/pstack.c +++ b/tools/perf/util/pstack.c @@ -5,10 +5,10 @@ * (c) 2010 Arnaldo Carvalho de Melo */ -#include "util.h" #include "pstack.h" #include "debug.h" #include +#include #include struct pstack { diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 2237bac9fadb..ceb8afdf9a89 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -18,6 +18,7 @@ util/namespaces.c ../lib/hweight.c ../lib/string.c ../lib/vsprintf.c +../lib/zalloc.c util/thread_map.c util/util.c util/xyarray.c diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 10d36d9b7909..ea669702825d 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -146,6 +146,7 @@ #include #include #include +#include #include #include @@ -156,7 +157,6 @@ #include "evlist.h" #include "machine.h" #include "session.h" -#include "util.h" #include "thread.h" #include "debug.h" #include "auxtrace.h" diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index e3463df18493..d0fd6c614e68 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -18,7 +19,6 @@ #include "session.h" #include "tool.h" #include "sort.h" -#include "util.h" #include "cpumap.h" #include "perf_regs.h" #include "asm/bug.h" diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index dcad75daf5e4..6ccf6f6d09df 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -6,9 +6,9 @@ #include #include +#include #include "util/dso.h" -#include "util/util.h" #include "util/debug.h" #include "util/callchain.h" #include "util/symbol_conf.h" diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index d91fe754b6d2..c967715c1d4c 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -6,6 +6,7 @@ #include "evlist.h" #include "evsel.h" #include "thread_map.h" +#include void update_stats(struct stats *stats, u64 val) { diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c index 54336df089df..2ce0dc887364 100644 --- a/tools/perf/util/strbuf.c +++ b/tools/perf/util/strbuf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "debug.h" -#include "util.h" #include +#include #include #include diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c index 90ea2b209cbb..78aa4c3b990d 100644 --- a/tools/perf/util/strfilter.c +++ b/tools/perf/util/strfilter.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include "util.h" #include "string2.h" #include "strfilter.h" #include +#include #include #include +#include /* Operators */ static const char *OP_and = "&"; /* Logical AND */ diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index af45c6fd97db..8a868cbeffae 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -4,12 +4,12 @@ */ #include "strlist.h" -#include "util.h" #include #include #include #include #include +#include static struct rb_node *strlist__node_new(struct rblist *rblist, const void *entry) diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index fab8a048d31b..76cc54000483 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -15,10 +15,10 @@ #include #include #include +#include #include "perf.h" #include "svghelper.h" -#include "util.h" #include "cpumap.h" static u64 first_time, last_time; diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 429920978cb0..ad683fbe9678 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -17,6 +17,7 @@ #include "debug.h" #include "util.h" #include +#include #include #ifndef EM_AARCH64 diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index c8b7cadbc9c4..3bc8b7e3300e 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -10,7 +10,7 @@ #include #include #include - +#include static bool check_need_swap(int file_endian) { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index ae2ce255e848..173f3378aaa0 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -26,6 +26,7 @@ #include "header.h" #include "path.h" #include +#include #include #include diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index c2037ac533f3..022a9c670338 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -10,9 +10,9 @@ #include #ifdef HAVE_SYSCALL_TABLE_SUPPORT +#include #include #include "string2.h" -#include "util.h" #if defined(__x86_64__) #include diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 48d585a0175c..15134ac9b8f1 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -7,13 +7,13 @@ #include #include #include +#include #include #include #include "thread.h" #include "event.h" #include "machine.h" #include "env.h" -#include "util.h" #include "debug.h" #include "symbol.h" #include "comm.h" diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 3e29a4e8b5e6..bbfb9c767f5f 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -5,10 +5,10 @@ #include #include #include +#include #include "session.h" #include "thread.h" #include "thread-stack.h" -#include "util.h" #include "debug.h" #include "namespaces.h" #include "comm.h" diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 281bf06f10f2..c291874352cf 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -13,9 +13,9 @@ #include #include #include +#include #include "asm/bug.h" #include "thread_map.h" -#include "util.h" #include "debug.h" #include "event.h" diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index 806a11b334d3..4550015b9d5d 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../perf.h" #include "trace-event.h" diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c index b023db136ef3..ba58f69777a1 100644 --- a/tools/perf/util/trace-event-scripting.c +++ b/tools/perf/util/trace-event-scripting.c @@ -12,8 +12,8 @@ #include "../perf.h" #include "debug.h" -#include "util.h" #include "trace-event.h" +#include struct scripting_context *scripting_context; diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 407d0167b942..28f71ca6ce1c 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -12,6 +12,7 @@ #include "symbol.h" #include "thread.h" #include +#include #include "event.h" #include "perf_regs.h" #include "callchain.h" diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index cfc4d85bbd42..dc7a469921e9 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -9,8 +9,6 @@ #include #include #include -#include -#include #include #include @@ -18,13 +16,6 @@ void usage(const char *err) __noreturn; void die(const char *err, ...) __noreturn __printf(1, 2); -static inline void *zalloc(size_t size) -{ - return calloc(1, size); -} - -#define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) - struct dirent; struct nsinfo; struct strlist; diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c index 4b7a303e4ba8..c59154e2d124 100644 --- a/tools/perf/util/values.c +++ b/tools/perf/util/values.c @@ -3,8 +3,8 @@ #include #include #include +#include -#include "util.h" #include "values.h" #include "debug.h" diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index 5031b7b22bbd..7f427bab6c12 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -16,6 +16,7 @@ #include "machine.h" #include "thread.h" #include "linux/string.h" +#include #include "debug.h" /* diff --git a/tools/perf/util/xyarray.c b/tools/perf/util/xyarray.c index dc95154f5646..86889ebc3514 100644 --- a/tools/perf/util/xyarray.c +++ b/tools/perf/util/xyarray.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "xyarray.h" -#include "util.h" #include #include +#include struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size) { -- cgit v1.2.3 From d8f9da240495b50766239410f9b0c715ca506a67 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 4 Jul 2019 12:06:20 -0300 Subject: perf tools: Use zfree() where applicable In places where the equivalent was already being done, i.e.: free(a); a = NULL; And in placs where struct members are being freed so that if we have some erroneous reference to its struct, then accesses to freed members will result in segfaults, which we can detect faster than use after free to areas that may still have something seemingly valid. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-jatyoofo5boc1bsvoig6bb6i@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-hash.c | 3 +- tools/perf/bench/futex-lock-pi.c | 3 +- tools/perf/builtin-record.c | 4 +-- tools/perf/builtin-stat.c | 4 +-- tools/perf/tests/dwarf-unwind.c | 5 ++-- tools/perf/tests/expr.c | 3 +- tools/perf/tests/mem2node.c | 3 +- tools/perf/tests/thread-map.c | 3 +- tools/perf/ui/browsers/res_sample.c | 6 ++-- tools/perf/ui/browsers/scripts.c | 4 +-- tools/perf/util/annotate.c | 3 +- tools/perf/util/auxtrace.c | 5 ++-- tools/perf/util/cgroup.c | 2 +- tools/perf/util/cputopo.c | 2 +- tools/perf/util/cs-etm.c | 5 ++-- tools/perf/util/data-convert-bt.c | 2 +- tools/perf/util/data.c | 2 +- tools/perf/util/env.c | 8 ++--- tools/perf/util/event.c | 2 +- tools/perf/util/header.c | 6 ++-- tools/perf/util/hist.c | 14 ++++----- tools/perf/util/jitdump.c | 7 ++--- tools/perf/util/llvm-utils.c | 3 +- tools/perf/util/machine.c | 4 +-- tools/perf/util/metricgroup.c | 9 +++--- tools/perf/util/probe-event.c | 51 +++++++++++++++----------------- tools/perf/util/s390-cpumsf.c | 7 ++--- tools/perf/util/srccode.c | 9 +++--- tools/perf/util/stat-shadow.c | 3 +- tools/perf/util/stat.c | 2 +- tools/perf/util/symbol-elf.c | 10 +++---- tools/perf/util/thread_map.c | 2 +- tools/perf/util/unwind-libunwind-local.c | 3 +- 33 files changed, 100 insertions(+), 99 deletions(-) (limited to 'tools') diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 9aa3a674829b..a80797763e1f 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "../util/stat.h" @@ -214,7 +215,7 @@ int bench_futex_hash(int argc, const char **argv) &worker[i].futex[nfutexes-1], t); } - free(worker[i].futex); + zfree(&worker[i].futex); } print_summary(); diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 8e9c4753e304..d02330a69745 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "bench.h" #include "futex.h" @@ -217,7 +218,7 @@ int bench_futex_lock_pi(int argc, const char **argv) worker[i].tid, worker[i].futex, t); if (multi) - free(worker[i].futex); + zfree(&worker[i].futex); } print_summary(); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index dca55997934e..8779cee58185 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -11,7 +11,6 @@ #include "perf.h" #include "util/build-id.h" -#include "util/util.h" #include #include "util/parse-events.h" #include "util/config.h" @@ -54,6 +53,7 @@ #include #include #include +#include struct switch_output { bool enabled; @@ -1110,7 +1110,7 @@ record__switch_output(struct record *rec, bool at_exit) rec->switch_output.cur_file = n; if (rec->switch_output.filenames[n]) { remove(rec->switch_output.filenames[n]); - free(rec->switch_output.filenames[n]); + zfree(&rec->switch_output.filenames[n]); } rec->switch_output.filenames[n] = new_filename; } else { diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c72f4a0831a8..b55a534b4de0 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1586,7 +1586,7 @@ static void runtime_stat_delete(struct perf_stat_config *config) for (i = 0; i < config->stats_num; i++) runtime_stat__exit(&config->stats[i]); - free(config->stats); + zfree(&config->stats); } static const char * const stat_report_usage[] = { @@ -2003,7 +2003,7 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: - free(stat_config.walltime_run); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) sysfs__write_int(FREEZE_ON_SMI_PATH, 0); diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index 077c306c1cae..f33709a79335 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include "tests.h" @@ -115,8 +116,8 @@ noinline int test_dwarf_unwind__thread(struct thread *thread) } out: - free(sample.user_stack.data); - free(sample.user_regs.regs); + zfree(&sample.user_stack.data); + zfree(&sample.user_regs.regs); return err; } diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 9acc1e80b936..ee1d88650e69 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -3,6 +3,7 @@ #include "util/expr.h" #include "tests.h" #include +#include static int test(struct parse_ctx *ctx, const char *e, double val2) { @@ -58,7 +59,7 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused) TEST_ASSERT_VAL("find other", other[3] == NULL); for (i = 0; i < num_other; i++) - free((void *)other[i]); + zfree(&other[i]); free((void *)other); return 0; diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c index d23ff1b68eba..520cc91af256 100644 --- a/tools/perf/tests/mem2node.c +++ b/tools/perf/tests/mem2node.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include "cpumap.h" #include "mem2node.h" #include "tests.h" @@ -67,7 +68,7 @@ int test__mem2node(struct test *t __maybe_unused, int subtest __maybe_unused) T("failed: mem2node__node", -1 == mem2node__node(&map, 0x1050)); for (i = 0; i < ARRAY_SIZE(nodes); i++) - free(nodes[i].set); + zfree(&nodes[i].set); mem2node__exit(&map); return 0; diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 4de1939b58ba..ccc17aced49e 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -6,6 +6,7 @@ #include "tests.h" #include "thread_map.h" #include "debug.h" +#include #define NAME (const char *) "perf" #define NAMEUL (unsigned long) NAME @@ -133,7 +134,7 @@ int test__thread_map_remove(struct test *test __maybe_unused, int subtest __mayb thread_map__remove(threads, 0)); for (i = 0; i < threads->nr; i++) - free(threads->map[i].comm); + zfree(&threads->map[i].comm); free(threads); return 0; diff --git a/tools/perf/ui/browsers/res_sample.c b/tools/perf/ui/browsers/res_sample.c index c0dd73176d42..8aa3547bb9ff 100644 --- a/tools/perf/ui/browsers/res_sample.c +++ b/tools/perf/ui/browsers/res_sample.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* Display a menu with individual samples to browse with perf script */ -#include "util.h" #include "hist.h" #include "evsel.h" #include "hists.h" @@ -8,6 +7,7 @@ #include "config.h" #include "time-utils.h" #include +#include static u64 context_len = 10 * NSEC_PER_MSEC; @@ -46,14 +46,14 @@ int res_sample_browse(struct res_sample *res_samples, int num_res, if (asprintf(&names[i], "%s: CPU %d tid %d", tbuf, res_samples[i].cpu, res_samples[i].tid) < 0) { while (--i >= 0) - free(names[i]); + zfree(&names[i]); free(names); return -1; } } choice = ui__popup_menu(num_res, names); for (i = 0; i < num_res; i++) - free(names[i]); + zfree(&names[i]); free(names); if (choice < 0 || choice >= num_res) diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 27cf3ab88d13..4d565cc14076 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include "../../util/sort.h" -#include "../../util/util.h" #include "../../util/hist.h" #include "../../util/debug.h" #include "../../util/symbol.h" #include "../browser.h" #include "../libslang.h" #include "config.h" +#include #define SCRIPT_NAMELEN 128 #define SCRIPT_MAX_NO 64 @@ -142,7 +142,7 @@ static int list_scripts(char *script_name, bool *custom, out: free(buf); for (i = 0; i < max_std; i++) - free(paths[i]); + zfree(&paths[i]); return ret; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 944a6507a5e3..ef0e6028684c 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1235,8 +1235,7 @@ void disasm_line__free(struct disasm_line *dl) dl->ins.ops->free(&dl->ops); else ins__delete(&dl->ops); - free((void *)dl->ins.name); - dl->ins.name = NULL; + zfree(&dl->ins.name); annotation_line__delete(&dl->al); } diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 0812a11a0dbe..b033a43dfe3b 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1413,7 +1413,7 @@ void auxtrace_cache__free(struct auxtrace_cache *c) return; auxtrace_cache__drop(c); - free(c->hashtable); + zfree(&c->hashtable); free(c); } @@ -1459,12 +1459,11 @@ void *auxtrace_cache__lookup(struct auxtrace_cache *c, u32 key) static void addr_filter__free_str(struct addr_filter *filt) { - free(filt->str); + zfree(&filt->str); filt->action = NULL; filt->sym_from = NULL; filt->sym_to = NULL; filt->filename = NULL; - filt->str = NULL; } static struct addr_filter *addr_filter__new(void) diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index f505d78f059b..484c29830a81 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -124,7 +124,7 @@ static struct cgroup *cgroup__new(const char *name) return cgroup; out_free_name: - free(cgroup->name); + zfree(&cgroup->name); out_err: free(cgroup); return NULL; diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index fa1778aee5d6..64336a280967 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -344,7 +344,7 @@ void numa_topology__delete(struct numa_topology *tp) u32 i; for (i = 0; i < tp->nr; i++) - free(tp->nodes[i].cpus); + zfree(&tp->nodes[i].cpus); free(tp); } diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index d92516edbead..508e4a3ddc8c 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -555,8 +555,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq) etmq->traceid_queues_list = NULL; /* finally free the traceid_queues array */ - free(etmq->traceid_queues); - etmq->traceid_queues = NULL; + zfree(&etmq->traceid_queues); } static void cs_etm__free_queue(void *priv) @@ -2569,7 +2568,7 @@ err_free_etm: err_free_metadata: /* No need to check @metadata[j], free(NULL) is supported */ for (j = 0; j < num_cpu; j++) - free(metadata[j]); + zfree(&metadata[j]); zfree(&metadata); err_free_traceid_list: intlist__delete(traceid_list); diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 1e93f2e94c40..ddbcd59f2d9b 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1353,7 +1353,7 @@ static void free_streams(struct ctf_writer *cw) for (cpu = 0; cpu < cw->stream_cnt; cpu++) ctf_stream__delete(cw->stream[cpu]); - free(cw->stream); + zfree(&cw->stream); } static int ctf_writer__setup_env(struct ctf_writer *cw, diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index df7e000e19ea..1d1b97a92c3f 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -21,7 +21,7 @@ static void close_dir(struct perf_data_file *files, int nr) { while (--nr >= 1) { close(files[nr].fd); - free(files[nr].path); + zfree(&files[nr].path); } free(files); } diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index f92d992bd2db..9909ec40c6d2 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -187,7 +187,7 @@ void perf_env__exit(struct perf_env *env) zfree(&env->caches); for (i = 0; i < env->nr_memory_nodes; i++) - free(env->memory_nodes[i].set); + zfree(&env->memory_nodes[i].set); zfree(&env->memory_nodes); } @@ -287,9 +287,9 @@ int perf_env__nr_cpus_avail(struct perf_env *env) void cpu_cache_level__free(struct cpu_cache_level *cache) { - free(cache->type); - free(cache->map); - free(cache->size); + zfree(&cache->type); + zfree(&cache->map); + zfree(&cache->size); } /* diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 7524bda5140b..f1f4848947ce 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -856,7 +856,7 @@ free_threads: free(synthesize_threads); free_dirent: for (i = 0; i < n; i++) - free(dirent[i]); + zfree(&dirent[i]); free(dirent); return err; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4e2efaa50c2f..c24db7f4909c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1052,7 +1052,7 @@ static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 lev scnprintf(file, PATH_MAX, "%s/size", path); if (sysfs__read_str(file, &cache->size, &len)) { - free(cache->type); + zfree(&cache->type); return -1; } @@ -1061,8 +1061,8 @@ static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 lev scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path); if (sysfs__read_str(file, &cache->map, &len)) { - free(cache->map); - free(cache->type); + zfree(&cache->map); + zfree(&cache->type); return -1; } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index bb1d77331add..9b0ee0ef0f44 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -472,16 +472,16 @@ static int hist_entry__init(struct hist_entry *he, return 0; err_srcline: - free(he->srcline); + zfree(&he->srcline); err_rawdata: - free(he->raw_data); + zfree(&he->raw_data); err_infos: if (he->branch_info) { map__put(he->branch_info->from.map); map__put(he->branch_info->to.map); - free(he->branch_info); + zfree(&he->branch_info); } if (he->mem_info) { map__put(he->mem_info->iaddr.map); @@ -489,7 +489,7 @@ err_infos: } err: map__zput(he->ms.map); - free(he->stat_acc); + zfree(&he->stat_acc); return -ENOMEM; } @@ -1254,10 +1254,10 @@ void hist_entry__delete(struct hist_entry *he) zfree(&he->stat_acc); free_srcline(he->srcline); if (he->srcfile && he->srcfile[0]) - free(he->srcfile); + zfree(&he->srcfile); free_callchain(he->callchain); - free(he->trace_output); - free(he->raw_data); + zfree(&he->trace_output); + zfree(&he->raw_data); ops->free(he); } diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 28908afedec4..18c34f0c1966 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -29,6 +29,7 @@ #include "../builtin.h" #include +#include struct jit_buf_desc { struct perf_data *output; @@ -431,14 +432,12 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) jd->unwinding_data, jd->eh_frame_hdr_size, jd->unwinding_size); if (jd->debug_data && jd->nr_debug_entries) { - free(jd->debug_data); - jd->debug_data = NULL; + zfree(&jd->debug_data); jd->nr_debug_entries = 0; } if (jd->unwinding_data && jd->eh_frame_hdr_size) { - free(jd->unwinding_data); - jd->unwinding_data = NULL; + zfree(&jd->unwinding_data); jd->eh_frame_hdr_size = 0; jd->unwinding_mapped_size = 0; jd->unwinding_size = 0; diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index b9fddb809d58..9f0470ecbca9 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -353,8 +353,7 @@ void llvm__get_kbuild_opts(char **kbuild_dir, char **kbuild_include_opts) " \toption in [llvm] to \"\" to suppress this detection.\n\n", *kbuild_dir); - free(*kbuild_dir); - *kbuild_dir = NULL; + zfree(kbuild_dir); goto errout; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f523da3009e4..cf826eca3aaf 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -810,7 +810,7 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start, out: /* put the dso here, corresponding to machine__findnew_module_dso */ dso__put(dso); - free(m.name); + zfree(&m.name); return map; } @@ -1350,7 +1350,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg, if (m.kmod) ret = map_groups__set_module_path(mg, path, &m); - free(m.name); + zfree(&m.name); if (ret) goto out; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index d8164574cb16..0d8c840f88c0 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -18,6 +18,7 @@ #include "strlist.h" #include #include +#include struct metric_event *metricgroup__lookup(struct rblist *metric_events, struct perf_evsel *evsel, @@ -235,7 +236,7 @@ static struct rb_node *mep_new(struct rblist *rl __maybe_unused, goto out_name; return &me->nd; out_name: - free((char *)me->name); + zfree(&me->name); out_me: free(me); return NULL; @@ -263,7 +264,7 @@ static void mep_delete(struct rblist *rl __maybe_unused, struct mep *me = container_of(nd, struct mep, nd); strlist__delete(me->metrics); - free((void *)me->name); + zfree(&me->name); free(me); } @@ -489,8 +490,8 @@ static void metricgroup__free_egroups(struct list_head *group_list) list_for_each_entry_safe (eg, egtmp, group_list, nd) { for (i = 0; i < eg->idnum; i++) - free((char *)eg->ids[i]); - free(eg->ids); + zfree(&eg->ids[i]); + zfree(&eg->ids); free(eg); } } diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 80c0eca0f1ee..0a57b316c4dd 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -214,9 +214,9 @@ out: static void clear_perf_probe_point(struct perf_probe_point *pp) { - free(pp->file); - free(pp->function); - free(pp->lazy_line); + zfree(&pp->file); + zfree(&pp->function); + zfree(&pp->lazy_line); } static void clear_probe_trace_events(struct probe_trace_event *tevs, int ntevs) @@ -1175,12 +1175,11 @@ int show_available_vars(struct perf_probe_event *pevs __maybe_unused, void line_range__clear(struct line_range *lr) { - free(lr->function); - free(lr->file); - free(lr->path); - free(lr->comp_dir); + zfree(&lr->function); + zfree(&lr->file); + zfree(&lr->path); + zfree(&lr->comp_dir); intlist__delete(lr->line_list); - memset(lr, 0, sizeof(*lr)); } int line_range__init(struct line_range *lr) @@ -2203,15 +2202,15 @@ void clear_perf_probe_event(struct perf_probe_event *pev) struct perf_probe_arg_field *field, *next; int i; - free(pev->event); - free(pev->group); - free(pev->target); + zfree(&pev->event); + zfree(&pev->group); + zfree(&pev->target); clear_perf_probe_point(&pev->point); for (i = 0; i < pev->nargs; i++) { - free(pev->args[i].name); - free(pev->args[i].var); - free(pev->args[i].type); + zfree(&pev->args[i].name); + zfree(&pev->args[i].var); + zfree(&pev->args[i].type); field = pev->args[i].field; while (field) { next = field->next; @@ -2220,8 +2219,7 @@ void clear_perf_probe_event(struct perf_probe_event *pev) field = next; } } - free(pev->args); - memset(pev, 0, sizeof(*pev)); + zfree(&pev->args); } #define strdup_or_goto(str, label) \ @@ -2302,15 +2300,15 @@ void clear_probe_trace_event(struct probe_trace_event *tev) struct probe_trace_arg_ref *ref, *next; int i; - free(tev->event); - free(tev->group); - free(tev->point.symbol); - free(tev->point.realname); - free(tev->point.module); + zfree(&tev->event); + zfree(&tev->group); + zfree(&tev->point.symbol); + zfree(&tev->point.realname); + zfree(&tev->point.module); for (i = 0; i < tev->nargs; i++) { - free(tev->args[i].name); - free(tev->args[i].value); - free(tev->args[i].type); + zfree(&tev->args[i].name); + zfree(&tev->args[i].value); + zfree(&tev->args[i].type); ref = tev->args[i].ref; while (ref) { next = ref->next; @@ -2318,8 +2316,7 @@ void clear_probe_trace_event(struct probe_trace_event *tev) ref = next; } } - free(tev->args); - memset(tev, 0, sizeof(*tev)); + zfree(&tev->args); } struct kprobe_blacklist_node { @@ -2337,7 +2334,7 @@ static void kprobe_blacklist__delete(struct list_head *blacklist) node = list_first_entry(blacklist, struct kprobe_blacklist_node, list); list_del(&node->list); - free(node->symbol); + zfree(&node->symbol); free(node); } } diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index ea669702825d..cca9cb851d02 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -1044,7 +1044,7 @@ static void s390_cpumsf_free(struct perf_session *session) auxtrace_heap__free(&sf->heap); s390_cpumsf_free_queues(session); session->auxtrace = NULL; - free(sf->logdir); + zfree(&sf->logdir); free(sf); } @@ -1101,8 +1101,7 @@ static int s390_cpumsf__config(const char *var, const char *value, void *cb) if (rc == -1 || !S_ISDIR(stbuf.st_mode)) { pr_err("Missing auxtrace log directory %s," " continue with current directory...\n", value); - free(sf->logdir); - sf->logdir = NULL; + zfree(&sf->logdir); } return 1; } @@ -1162,7 +1161,7 @@ err_free_queues: auxtrace_queues__free(&sf->queues); session->auxtrace = NULL; err_free: - free(sf->logdir); + zfree(&sf->logdir); free(sf); return err; } diff --git a/tools/perf/util/srccode.c b/tools/perf/util/srccode.c index 684b155c222a..688a85a3d454 100644 --- a/tools/perf/util/srccode.c +++ b/tools/perf/util/srccode.c @@ -4,7 +4,8 @@ * Copyright (c) 2017, Intel Corporation. * Author: Andi Kleen */ -#include "linux/list.h" +#include +#include #include #include #include @@ -86,8 +87,8 @@ static void free_srcfile(struct srcfile *sf) hlist_del(&sf->hash_nd); map_total_sz -= sf->maplen; munmap(sf->map, sf->maplen); - free(sf->lines); - free(sf->fn); + zfree(&sf->lines); + zfree(&sf->fn); free(sf); num_srcfiles--; } @@ -153,7 +154,7 @@ static struct srcfile *find_srcfile(char *fn) out_map: munmap(h->map, sz); out_fn: - free(h->fn); + zfree(&h->fn); out_h: free(h); return NULL; diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index cb891e5c2969..656065af4971 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -8,6 +8,7 @@ #include "evlist.h" #include "expr.h" #include "metricgroup.h" +#include /* * AGGR_GLOBAL: Use CPU 0 @@ -775,7 +776,7 @@ static void generic_metric(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, "", 0); for (i = 1; i < pctx.num_ids; i++) - free((void *)pctx.ids[i].name); + zfree(&pctx.ids[i].name); } void perf_stat__print_shadow_stats(struct perf_stat_config *config, diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index c967715c1d4c..db8a6cf336be 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -133,7 +133,7 @@ static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) struct perf_stat_evsel *ps = evsel->stats; if (ps) - free(ps->group_data); + zfree(&ps->group_data); zfree(&evsel->stats); } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index ad683fbe9678..1d5447594f5d 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -2133,11 +2133,11 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len, return 0; out_free_args: - free(tmp->args); + zfree(&tmp->args); out_free_name: - free(tmp->name); + zfree(&tmp->name); out_free_prov: - free(tmp->provider); + zfree(&tmp->provider); out_free_note: free(tmp); out_err: @@ -2253,8 +2253,8 @@ int cleanup_sdt_note_list(struct list_head *sdt_notes) list_for_each_entry_safe(pos, tmp, sdt_notes, note_list) { list_del(&pos->note_list); - free(pos->name); - free(pos->provider); + zfree(&pos->name); + zfree(&pos->provider); free(pos); nr_free++; } diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index c291874352cf..5b3511f2b6b1 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -480,7 +480,7 @@ int thread_map__remove(struct thread_map *threads, int idx) /* * Free the 'idx' item and shift the rest up. */ - free(threads->map[idx].comm); + zfree(&threads->map[idx].comm); for (i = idx; i < threads->nr - 1; i++) threads->map[i] = threads->map[i + 1]; diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 25e1406b1f8b..71a788921b62 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -25,6 +25,7 @@ #include #include #include +#include #ifndef REMOTE_UNWIND_LIBUNWIND #include #include @@ -345,7 +346,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso, __func__, dso->symsrc_filename, debuglink); - free(dso->symsrc_filename); + zfree(&dso->symsrc_filename); } dso->symsrc_filename = debuglink; } else { -- cgit v1.2.3 From e56fbc9dc79ce0fdc49ffadd062214ddd02f65b6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 4 Jul 2019 12:13:46 -0300 Subject: perf tools: Use list_del_init() more thorougly To allow for destructors to check if they're operating on a object still in a list, and to avoid going from use after free list entries into still valid, or even also other already removed from list entries. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-deh17ub44atyox3j90e6rksu@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 2 +- tools/perf/builtin-lock.c | 8 ++++---- tools/perf/pmu-events/jevents.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/ui/gtk/annotate.c | 2 +- tools/perf/util/annotate.c | 4 ++-- tools/perf/util/auxtrace.c | 4 ++-- tools/perf/util/bpf-loader.c | 2 +- tools/perf/util/call-path.c | 2 +- tools/perf/util/callchain.c | 10 +++++----- tools/perf/util/db-export.c | 4 ++-- tools/perf/util/dso.c | 2 +- tools/perf/util/evsel.c | 2 +- tools/perf/util/hist.c | 4 ++-- tools/perf/util/ordered-events.c | 6 +++--- tools/perf/util/parse-events.c | 2 +- tools/perf/util/pmu.c | 2 +- tools/perf/util/probe-event.c | 2 +- tools/perf/util/s390-cpumsf.c | 2 +- tools/perf/util/srccode.c | 2 +- tools/perf/util/symbol-elf.c | 6 +++--- tools/perf/util/thread.c | 4 ++-- 22 files changed, 38 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 9c228c55e1fb..66d5a6658daf 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -431,7 +431,7 @@ static void delete_filter_func(struct list_head *head) struct filter_entry *pos, *tmp; list_for_each_entry_safe(pos, tmp, head, list) { - list_del(&pos->list); + list_del_init(&pos->list); free(pos); } } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index c0be44e65e9d..574e30ec6d7c 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -454,7 +454,7 @@ broken: /* broken lock sequence, discard it */ ls->discard = 1; bad_hist[BROKEN_ACQUIRE]++; - list_del(&seq->list); + list_del_init(&seq->list); free(seq); goto end; default: @@ -515,7 +515,7 @@ static int report_lock_acquired_event(struct perf_evsel *evsel, /* broken lock sequence, discard it */ ls->discard = 1; bad_hist[BROKEN_ACQUIRED]++; - list_del(&seq->list); + list_del_init(&seq->list); free(seq); goto end; default: @@ -570,7 +570,7 @@ static int report_lock_contended_event(struct perf_evsel *evsel, /* broken lock sequence, discard it */ ls->discard = 1; bad_hist[BROKEN_CONTENDED]++; - list_del(&seq->list); + list_del_init(&seq->list); free(seq); goto end; default: @@ -639,7 +639,7 @@ static int report_lock_release_event(struct perf_evsel *evsel, ls->nr_release++; free_seq: - list_del(&seq->list); + list_del_init(&seq->list); free(seq); end: return 0; diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 287a6f10ca48..1a91a197cafb 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -407,7 +407,7 @@ static void free_arch_std_events(void) list_for_each_entry_safe(es, next, &arch_std_events, list) { FOR_ALL_EVENT_STRUCT_FIELDS(FREE_EVENT_FIELD); - list_del(&es->list); + list_del_init(&es->list); free(es); } } diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 744409dce65f..6cdab5f4812a 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -238,7 +238,7 @@ static void free_event_nodes(struct list_head *events) while (!list_empty(events)) { node = list_entry(events->next, struct event_node, list); - list_del(&node->list); + list_del_init(&node->list); free(node); } } diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c index df49c9ba1785..3af87c18a914 100644 --- a/tools/perf/ui/gtk/annotate.c +++ b/tools/perf/ui/gtk/annotate.c @@ -152,7 +152,7 @@ static int perf_gtk__annotate_symbol(GtkWidget *window, struct symbol *sym, gtk_container_add(GTK_CONTAINER(window), view); list_for_each_entry_safe(pos, n, ¬es->src->source, al.node) { - list_del(&pos->al.node); + list_del_init(&pos->al.node); disasm_line__free(pos); } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ef0e6028684c..ac9ad2330f93 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1586,7 +1586,7 @@ static void delete_last_nop(struct symbol *sym) return; } - list_del(&dl->al.node); + list_del_init(&dl->al.node); disasm_line__free(dl); } } @@ -2463,7 +2463,7 @@ void annotated_source__purge(struct annotated_source *as) struct annotation_line *al, *n; list_for_each_entry_safe(al, n, &as->source, node) { - list_del(&al->node); + list_del_init(&al->node); disasm_line__free(disasm_line(al)); } } diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index b033a43dfe3b..ec0af36697c4 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -408,7 +408,7 @@ void auxtrace_queues__free(struct auxtrace_queues *queues) buffer = list_entry(queues->queue_array[i].head.next, struct auxtrace_buffer, list); - list_del(&buffer->list); + list_del_init(&buffer->list); auxtrace_buffer__free(buffer); } } @@ -612,7 +612,7 @@ void auxtrace_index__free(struct list_head *head) struct auxtrace_index *auxtrace_index, *n; list_for_each_entry_safe(auxtrace_index, n, head, list) { - list_del(&auxtrace_index->list); + list_del_init(&auxtrace_index->list); free(auxtrace_index); } } diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 93d0f239ad4f..c61974a50aa5 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -829,7 +829,7 @@ static void bpf_map_op__delete(struct bpf_map_op *op) { if (!list_empty(&op->list)) - list_del(&op->list); + list_del_init(&op->list); if (op->key_type == BPF_MAP_KEY_RANGES) parse_events__clear_array(&op->k.array); free(op); diff --git a/tools/perf/util/call-path.c b/tools/perf/util/call-path.c index e8a80c41cba3..5c60b8be1cf6 100644 --- a/tools/perf/util/call-path.c +++ b/tools/perf/util/call-path.c @@ -40,7 +40,7 @@ void call_path_root__free(struct call_path_root *cpr) struct call_path_block *pos, *n; list_for_each_entry_safe(pos, n, &cpr->blocks, node) { - list_del(&pos->node); + list_del_init(&pos->node); free(pos); } free(cpr); diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index b4af25dca5eb..8d7d8f62fcca 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -636,7 +636,7 @@ add_child(struct callchain_node *parent, struct callchain_list *call, *tmp; list_for_each_entry_safe(call, tmp, &new->val, list) { - list_del(&call->list); + list_del_init(&call->list); map__zput(call->ms.map); free(call); } @@ -1002,7 +1002,7 @@ merge_chain_branch(struct callchain_cursor *cursor, callchain_cursor_append(cursor, list->ip, list->ms.map, list->ms.sym, false, NULL, 0, 0, 0, list->srcline); - list_del(&list->list); + list_del_init(&list->list); map__zput(list->ms.map); free(list); } @@ -1453,13 +1453,13 @@ static void free_callchain_node(struct callchain_node *node) struct rb_node *n; list_for_each_entry_safe(list, tmp, &node->parent_val, list) { - list_del(&list->list); + list_del_init(&list->list); map__zput(list->ms.map); free(list); } list_for_each_entry_safe(list, tmp, &node->val, list) { - list_del(&list->list); + list_del_init(&list->list); map__zput(list->ms.map); free(list); } @@ -1544,7 +1544,7 @@ int callchain_node__make_parent_list(struct callchain_node *node) out: list_for_each_entry_safe(chain, new, &head, list) { - list_del(&chain->list); + list_del_init(&chain->list); map__zput(chain->ms.map); free(chain); } diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 3f2694ccfac7..2394c7506abe 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -34,7 +34,7 @@ static int db_export__deferred(struct db_export *dbe) de = list_entry(dbe->deferred.next, struct deferred_export, node); err = dbe->export_comm(dbe, de->comm); - list_del(&de->node); + list_del_init(&de->node); free(de); if (err) return err; @@ -50,7 +50,7 @@ static void db_export__free_deferred(struct db_export *dbe) while (!list_empty(&dbe->deferred)) { de = list_entry(dbe->deferred.next, struct deferred_export, node); - list_del(&de->node); + list_del_init(&de->node); free(de); } } diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index ebacf07fc9ee..ebc9d46c15a7 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -434,7 +434,7 @@ static void dso__list_add(struct dso *dso) static void dso__list_del(struct dso *dso) { - list_del(&dso->data.open_entry); + list_del_init(&dso->data.open_entry); WARN_ONCE(dso__data_open_cnt <= 0, "DSO data fd counter out of bounds."); dso__data_open_cnt--; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 7ede674edf07..ebb46da4dfe5 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1298,7 +1298,7 @@ static void perf_evsel__free_config_terms(struct perf_evsel *evsel) struct perf_evsel_config_term *term, *h; list_for_each_entry_safe(term, h, &evsel->config_terms, list) { - list_del(&term->list); + list_del_init(&term->list); free(term); } } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 9b0ee0ef0f44..f24fd1954f6c 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2741,10 +2741,10 @@ static void hists_evsel__exit(struct perf_evsel *evsel) list_for_each_entry_safe(node, tmp, &hists->hpp_formats, list) { perf_hpp_list__for_each_format_safe(&node->hpp, fmt, pos) { - list_del(&fmt->list); + list_del_init(&fmt->list); free(fmt); } - list_del(&node->list); + list_del_init(&node->list); free(node); } } diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c index 989fed6f43b5..bb5f34b7ab44 100644 --- a/tools/perf/util/ordered-events.c +++ b/tools/perf/util/ordered-events.c @@ -138,7 +138,7 @@ static struct ordered_event *alloc_event(struct ordered_events *oe, if (!list_empty(cache)) { new = list_entry(cache->next, struct ordered_event, list); - list_del(&new->list); + list_del_init(&new->list); } else if (oe->buffer) { new = &oe->buffer->event[oe->buffer_idx]; if (++oe->buffer_idx == MAX_SAMPLE_BUFFER) @@ -394,13 +394,13 @@ void ordered_events__free(struct ordered_events *oe) * yet, we need to free only allocated ones ... */ if (oe->buffer) { - list_del(&oe->buffer->list); + list_del_init(&oe->buffer->list); ordered_events_buffer__free(oe->buffer, oe->buffer_idx, oe); } /* ... and continue with the rest */ list_for_each_entry_safe(buffer, tmp, &oe->to_free, list) { - list_del(&buffer->list); + list_del_init(&buffer->list); ordered_events_buffer__free(buffer, MAX_SAMPLE_BUFFER, oe); } } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index aa439853f20a..371ff3aee769 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -652,7 +652,7 @@ static int add_bpf_event(const char *group, const char *event, int fd, pr_debug("Failed to add BPF event %s:%s\n", group, event); list_for_each_entry_safe(evsel, tmp, &new_evsels, node) { - list_del(&evsel->node); + list_del_init(&evsel->node); perf_evsel__delete(evsel); } return err; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 12b677902fbc..f32b710347db 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1245,7 +1245,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms, info->metric_expr = alias->metric_expr; info->metric_name = alias->metric_name; - list_del(&term->list); + list_del_init(&term->list); free(term); } diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 0a57b316c4dd..0c3b55d0617d 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2333,7 +2333,7 @@ static void kprobe_blacklist__delete(struct list_head *blacklist) while (!list_empty(blacklist)) { node = list_first_entry(blacklist, struct kprobe_blacklist_node, list); - list_del(&node->list); + list_del_init(&node->list); zfree(&node->symbol); free(node); } diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index cca9cb851d02..83d2e149ef19 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -756,7 +756,7 @@ static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq, */ if (err) { sfq->buffer = NULL; - list_del(&buffer->list); + list_del_init(&buffer->list); auxtrace_buffer__free(buffer); if (err > 0) /* Buffer done, no error */ err = 0; diff --git a/tools/perf/util/srccode.c b/tools/perf/util/srccode.c index 688a85a3d454..adfcf1ff464c 100644 --- a/tools/perf/util/srccode.c +++ b/tools/perf/util/srccode.c @@ -83,7 +83,7 @@ static void fill_lines(char **lines, int maxline, char *map, int maplen) static void free_srcfile(struct srcfile *sf) { - list_del(&sf->nd); + list_del_init(&sf->nd); hlist_del(&sf->hash_nd); map_total_sz -= sf->maplen; munmap(sf->map, sf->maplen); diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 1d5447594f5d..7d504dc22108 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1478,7 +1478,7 @@ static void kcore_copy__free_phdrs(struct kcore_copy_info *kci) struct phdr_data *p, *tmp; list_for_each_entry_safe(p, tmp, &kci->phdrs, node) { - list_del(&p->node); + list_del_init(&p->node); free(p); } } @@ -1501,7 +1501,7 @@ static void kcore_copy__free_syms(struct kcore_copy_info *kci) struct sym_data *s, *tmp; list_for_each_entry_safe(s, tmp, &kci->syms, node) { - list_del(&s->node); + list_del_init(&s->node); free(s); } } @@ -2252,7 +2252,7 @@ int cleanup_sdt_note_list(struct list_head *sdt_notes) int nr_free = 0; list_for_each_entry_safe(pos, tmp, sdt_notes, note_list) { - list_del(&pos->note_list); + list_del_init(&pos->note_list); zfree(&pos->name); zfree(&pos->provider); free(pos); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index bbfb9c767f5f..873ab505ca80 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -93,14 +93,14 @@ void thread__delete(struct thread *thread) down_write(&thread->namespaces_lock); list_for_each_entry_safe(namespaces, tmp_namespaces, &thread->namespaces_list, list) { - list_del(&namespaces->list); + list_del_init(&namespaces->list); namespaces__free(namespaces); } up_write(&thread->namespaces_lock); down_write(&thread->comm_lock); list_for_each_entry_safe(comm, tmp_comm, &thread->comm_list, list) { - list_del(&comm->list); + list_del_init(&comm->list); comm__free(comm); } up_write(&thread->comm_lock); -- cgit v1.2.3 From acc7bfb3db9744c4a18c96fd6536069e8647cb11 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 4 Jul 2019 12:20:21 -0300 Subject: perf metricgroup: Add missing list_del_init() when flushing egroups list So that at the end each of the entries have its list node struct cleared and the egroup list head ends emptied. Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-dxzj1ah350fy9ec0xbhb15b6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 0d8c840f88c0..416a9015405e 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -492,6 +492,7 @@ static void metricgroup__free_egroups(struct list_head *group_list) for (i = 0; i < eg->idnum; i++) zfree(&eg->ids[i]); zfree(&eg->ids); + list_del_init(&eg->nd); free(eg); } } -- cgit v1.2.3 From 34c9af571e51466fbcc423fb955277c82f03ce92 Mon Sep 17 00:00:00 2001 From: Luke Mujica Date: Wed, 3 Jul 2019 15:25:08 -0700 Subject: perf parse-events: Remove unused variable 'i' Remove the 'int i' because it is declared but not used in parse-events.y or in the generated parse-events.c. Signed-off-by: Luke Mujica Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190703222509.109616-1-lukemujica@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.y | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 6ad8d4914969..172dbb73941f 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -626,7 +626,6 @@ PE_TERM PE_NAME array '=' PE_NAME { struct parse_events_term *term; - int i; ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER, $1, $4, &@1, &@4)); -- cgit v1.2.3 From 72de3fd97f15d75657dd5389e26801cbf8af0f9a Mon Sep 17 00:00:00 2001 From: Luke Mujica Date: Wed, 3 Jul 2019 15:25:09 -0700 Subject: perf parse-events: Remove unused variable: error Remove the 'error' variable because it is declared but not used in parse-events.y or in the generated parse-events.c. Signed-off-by: Luke Mujica Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190703222509.109616-2-lukemujica@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.y | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 172dbb73941f..f1c36ed1cf36 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -480,7 +480,6 @@ event_bpf_file: PE_BPF_OBJECT opt_event_config { struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; struct list_head *list; ALLOC_LIST(list); -- cgit v1.2.3 From 0702f23c983b8a921853c33a9f59b9cf0975d36e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 8 Jul 2019 22:39:37 +0800 Subject: perf cs-etm: Fix potential NULL pointer dereference found by the smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/util/cs-etm.c:2545 cs_etm__process_auxtrace_info() error: we previously assumed 'session->itrace_synth_opts' could be null (see line 2541) tools/perf/util/cs-etm.c 2541 if (session->itrace_synth_opts && session->itrace_synth_opts->set) { 2542 etm->synth_opts = *session->itrace_synth_opts; 2543 } else { 2544 itrace_synth_opts__set_default(&etm->synth_opts, 2545 session->itrace_synth_opts->default_no_sample); ^^^^^^^^^^^^^^^^^^^^^^^^^^ 2546 etm->synth_opts.callchain = false; 2547 } 'session->itrace_synth_opts' is impossible to be a NULL pointer in cs_etm__process_auxtrace_info(), thus this patch removes the NULL test for 'session->itrace_synth_opts'. Signed-off-by: Leo Yan Reviewed-by: Mathieu Poirier Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190708143937.7722-5-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 508e4a3ddc8c..67b88b599a53 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -2538,7 +2538,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event, return 0; } - if (session->itrace_synth_opts && session->itrace_synth_opts->set) { + if (session->itrace_synth_opts->set) { etm->synth_opts = *session->itrace_synth_opts; } else { itrace_synth_opts__set_default(&etm->synth_opts, -- cgit v1.2.3 From ceb75476db1617a88cc29b09839acacb69aa076e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 8 Jul 2019 22:39:34 +0800 Subject: perf hists browser: Fix potential NULL pointer dereference found by the smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/ui/browsers/hists.c:641 hist_browser__run() error: we previously assumed 'hbt' could be null (see line 625) tools/perf/ui/browsers/hists.c:3088 perf_evsel__hists_browse() error: we previously assumed 'browser->he_selection' could be null (see line 2902) tools/perf/ui/browsers/hists.c:3272 perf_evsel_menu__run() error: we previously assumed 'hbt' could be null (see line 3260) This patch firstly validating the pointers before access them, so can fix potential NULL pointer dereference. Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190708143937.7722-2-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 85581cfb9112..a94eb0755e8b 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -639,7 +639,11 @@ int hist_browser__run(struct hist_browser *browser, const char *help, switch (key) { case K_TIMER: { u64 nr_entries; - hbt->timer(hbt->arg); + + WARN_ON_ONCE(!hbt); + + if (hbt) + hbt->timer(hbt->arg); if (hist_browser__has_filter(browser) || symbol_conf.report_hierarchy) @@ -2821,7 +2825,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, { struct hists *hists = evsel__hists(evsel); struct hist_browser *browser = perf_evsel_browser__new(evsel, hbt, env, annotation_opts); - struct branch_info *bi; + struct branch_info *bi = NULL; #define MAX_OPTIONS 16 char *options[MAX_OPTIONS]; struct popup_action actions[MAX_OPTIONS]; @@ -3087,7 +3091,9 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, goto skip_annotation; if (sort__mode == SORT_MODE__BRANCH) { - bi = browser->he_selection->branch_info; + + if (browser->he_selection) + bi = browser->he_selection->branch_info; if (bi == NULL) goto skip_annotation; @@ -3271,7 +3277,8 @@ static int perf_evsel_menu__run(struct perf_evsel_menu *menu, switch (key) { case K_TIMER: - hbt->timer(hbt->arg); + if (hbt) + hbt->timer(hbt->arg); if (!menu->lost_events_warned && menu->lost_events && -- cgit v1.2.3 From d8d051df9f906232715282cc0570c94273b197bc Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 8 Jul 2019 08:52:31 +0300 Subject: perf scripts python: export-to-postgresql.py: Fix DROP VIEW power_events_view PostgreSQL can error if power_events_view is not dropped before its dependent tables e.g. Exception: Query failed: ERROR: cannot drop table mwait because other objects depend on it DETAIL: view power_events_view depends on table mwait Signed-off-by: Adrian Hunter Cc: Jiri Olsa Fixes: aba44287a224 ("perf scripts python: export-to-postgresql.py: Export Intel PT power and ptwrite events") Link: http://lkml.kernel.org/r/20190708055232.5032-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-postgresql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 4447f0d7c754..92713d93e956 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -898,11 +898,11 @@ def trace_end(): if is_table_empty("ptwrite"): drop("ptwrite") if is_table_empty("mwait") and is_table_empty("pwre") and is_table_empty("exstop") and is_table_empty("pwrx"): + do_query(query, 'DROP VIEW power_events_view'); drop("mwait") drop("pwre") drop("exstop") drop("pwrx") - do_query(query, 'DROP VIEW power_events_view'); if is_table_empty("cbr"): drop("cbr") -- cgit v1.2.3 From 1334bb94cd8a21217cb0c186925f9bc9d47adafc Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 8 Jul 2019 08:52:32 +0300 Subject: perf scripts python: export-to-sqlite.py: Fix DROP VIEW power_events_view Drop power_events_view before its dependent tables. SQLite does not seem to mind but the fix was needed for PostgreSQL (export-to-postgresql.py script), so do the same fix for the SQLite. It is more logical and keeps the 2 scripts following the same approach. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Fixes: 5130c6e55531 ("perf scripts python: export-to-sqlite.py: Export Intel PT power and ptwrite events") Link: http://lkml.kernel.org/r/20190708055232.5032-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-sqlite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index 3222a83f4184..021326c46285 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -608,11 +608,11 @@ def trace_end(): if is_table_empty("ptwrite"): drop("ptwrite") if is_table_empty("mwait") and is_table_empty("pwre") and is_table_empty("exstop") and is_table_empty("pwrx"): + do_query(query, 'DROP VIEW power_events_view'); drop("mwait") drop("pwre") drop("exstop") drop("pwrx") - do_query(query, 'DROP VIEW power_events_view'); if is_table_empty("cbr"): drop("cbr") -- cgit v1.2.3 From 9d49169c5958e429ffa6874fbef734ae7502ad65 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 20 Jun 2019 18:44:38 -0700 Subject: perf script: Assume native_arch for pipe mode In pipe mode, session->header.env.arch is not populated until the events are processed. Therefore, the following command crashes: perf record -o - | perf script (gdb) bt It fails when we try to compare env.arch against uts.machine: if (!strcmp(uts.machine, session->header.env.arch) || (!strcmp(uts.machine, "x86_64") && !strcmp(session->header.env.arch, "i386"))) native_arch = true; In pipe mode, it is tricky to find env.arch at this stage. To keep it simple, let's just assume native_arch is always true for pipe mode. Reported-by: David Carrillo Cisneros Signed-off-by: Song Liu Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kernel-team@fb.com Cc: stable@vger.kernel.org #v5.1+ Fixes: 3ab481a1cfe1 ("perf script: Support insn output for normal samples") Link: http://lkml.kernel.org/r/20190621014438.810342-1-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b3536820f9a8..79367087bd18 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3752,7 +3752,8 @@ int cmd_script(int argc, const char **argv) goto out_delete; uname(&uts); - if (!strcmp(uts.machine, session->header.env.arch) || + if (data.is_pipe || /* assume pipe_mode indicates native_arch */ + !strcmp(uts.machine, session->header.env.arch) || (!strcmp(uts.machine, "x86_64") && !strcmp(session->header.env.arch, "i386"))) native_arch = true; -- cgit v1.2.3 From 1d481458816d9424c8a05833ce0ebe72194a350e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 8 Jul 2019 22:39:35 +0800 Subject: perf intel-bts: Fix potential NULL pointer dereference found by the smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/util/intel-bts.c:898 intel_bts_process_auxtrace_info() error: we previously assumed 'session->itrace_synth_opts' could be null (see line 894) tools/perf/util/intel-bts.c:899 intel_bts_process_auxtrace_info() warn: variable dereferenced before check 'session->itrace_synth_opts' (see line 898) tools/perf/util/intel-bts.c 894 if (session->itrace_synth_opts && session->itrace_synth_opts->set) { 895 bts->synth_opts = *session->itrace_synth_opts; 896 } else { 897 itrace_synth_opts__set_default(&bts->synth_opts, 898 session->itrace_synth_opts->default_no_sample); ^^^^^^^^^^^^^^^^^^^^^^^^^^ 899 if (session->itrace_synth_opts) ^^^^^^^^^^^^^^^^^^^^^^^^^^ 900 bts->synth_opts.thread_stack = 901 session->itrace_synth_opts->thread_stack; 902 } 'session->itrace_synth_opts' is impossible to be a NULL pointer in intel_bts_process_auxtrace_info(), thus this patch removes the NULL test for 'session->itrace_synth_opts'. Signed-off-by: Leo Yan Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190708143937.7722-3-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-bts.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 5a21bcdb8ef7..5560e95afdda 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -891,13 +891,12 @@ int intel_bts_process_auxtrace_info(union perf_event *event, if (dump_trace) return 0; - if (session->itrace_synth_opts && session->itrace_synth_opts->set) { + if (session->itrace_synth_opts->set) { bts->synth_opts = *session->itrace_synth_opts; } else { itrace_synth_opts__set_default(&bts->synth_opts, session->itrace_synth_opts->default_no_sample); - if (session->itrace_synth_opts) - bts->synth_opts.thread_stack = + bts->synth_opts.thread_stack = session->itrace_synth_opts->thread_stack; } -- cgit v1.2.3 From 323fd749821daab0f327ec86d707c4542963cdb0 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 8 Jul 2019 22:39:36 +0800 Subject: perf intel-pt: Fix potential NULL pointer dereference found by the smatch tool Based on the following report from Smatch, fix the potential NULL pointer dereference check. tools/perf/util/intel-pt.c:3200 intel_pt_process_auxtrace_info() error: we previously assumed 'session->itrace_synth_opts' could be null (see line 3196) tools/perf/util/intel-pt.c:3206 intel_pt_process_auxtrace_info() warn: variable dereferenced before check 'session->itrace_synth_opts' (see line 3200) tools/perf/util/intel-pt.c 3196 if (session->itrace_synth_opts && session->itrace_synth_opts->set) { 3197 pt->synth_opts = *session->itrace_synth_opts; 3198 } else { 3199 itrace_synth_opts__set_default(&pt->synth_opts, 3200 session->itrace_synth_opts->default_no_sample); ^^^^^^^^^^^^^^^^^^^^^^^^^^ 3201 if (!session->itrace_synth_opts->default_no_sample && 3202 !session->itrace_synth_opts->inject) { 3203 pt->synth_opts.branches = false; 3204 pt->synth_opts.callchain = true; 3205 } 3206 if (session->itrace_synth_opts) ^^^^^^^^^^^^^^^^^^^^^^^^^^ 3207 pt->synth_opts.thread_stack = 3208 session->itrace_synth_opts->thread_stack; 3209 } 'session->itrace_synth_opts' is impossible to be a NULL pointer in intel_pt_process_auxtrace_info(), thus this patch removes the NULL test for 'session->itrace_synth_opts'. Signed-off-by: Leo Yan Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20190708143937.7722-4-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index c76a96f777fb..df061599fef4 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -3210,7 +3210,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event, goto err_delete_thread; } - if (session->itrace_synth_opts && session->itrace_synth_opts->set) { + if (session->itrace_synth_opts->set) { pt->synth_opts = *session->itrace_synth_opts; } else { itrace_synth_opts__set_default(&pt->synth_opts, @@ -3220,8 +3220,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event, pt->synth_opts.branches = false; pt->synth_opts.callchain = true; } - if (session->itrace_synth_opts) - pt->synth_opts.thread_stack = + pt->synth_opts.thread_stack = session->itrace_synth_opts->thread_stack; } @@ -3241,11 +3240,9 @@ int intel_pt_process_auxtrace_info(union perf_event *event, pt->cbr2khz = tsc_freq / pt->max_non_turbo_ratio / 1000; } - if (session->itrace_synth_opts) { - err = intel_pt_setup_time_ranges(pt, session->itrace_synth_opts); - if (err) - goto err_delete_thread; - } + err = intel_pt_setup_time_ranges(pt, session->itrace_synth_opts); + if (err) + goto err_delete_thread; if (pt->synth_opts.calls) pt->branches_filter |= PERF_IP_FLAG_CALL | PERF_IP_FLAG_ASYNC | -- cgit v1.2.3 From af3366308ea4c5473dfa5d2ace66b45cd3b1aec0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Jul 2019 19:13:59 +0300 Subject: tools: PCI: Fix installation when `make tools/pci_install` The commit c9a707875053 ("tools pci: Do not delete pcitest.sh in 'make clean'") fixed a `make tools clean` issue and simultaneously brought a regression to the installation process: for script in .../tools/pci/pcitest.sh; do \ install $script .../usr/usr/bin; \ done install: cannot stat '.../tools/pci/pcitest.sh': No such file or directory Update the install commands to fix the remaining issue. Signed-off-by: Andy Shevchenko Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I Cc: Jean-Jacques Hiblot Cc: Kishon Vijay Abraham I Cc: Lorenzo Pieralisi Cc: Arnaldo Carvalho de Melo --- tools/pci/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/pci/Makefile b/tools/pci/Makefile index 6876ee4bd78c..4b95a5176355 100644 --- a/tools/pci/Makefile +++ b/tools/pci/Makefile @@ -18,7 +18,6 @@ ALL_TARGETS := pcitest ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) SCRIPTS := pcitest.sh -ALL_SCRIPTS := $(patsubst %,$(OUTPUT)%,$(SCRIPTS)) all: $(ALL_PROGRAMS) @@ -47,10 +46,10 @@ clean: install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(bindir); \ - for program in $(ALL_PROGRAMS) pcitest.sh; do \ + for program in $(ALL_PROGRAMS); do \ install $$program $(DESTDIR)$(bindir); \ done; \ - for script in $(ALL_SCRIPTS); do \ + for script in $(SCRIPTS); do \ install $$script $(DESTDIR)$(bindir); \ done -- cgit v1.2.3 From 4975223b8156c14f0537dcde1554f050fb4d29bf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Jul 2019 14:49:26 -0300 Subject: perf tools: Introduce rlimit__bump_memlock() helper Just like the BPF guys did when faced with failures with map creation, etc, i.e. their solution is: tools/testing/selftests/bpf/bpf_rlimit.h For perf use this function in 'perf test' and in 'perf trace'. Make it bump to 4 times the current value, if it fails twice the current value and if it still fails, warn that things like BPF map creation may fail, to help in diagnosing the problem. Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-muvqef2i7n6pzqbmu7tn2d2y@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/rlimit.c | 29 +++++++++++++++++++++++++++++ tools/perf/util/rlimit.h | 6 ++++++ 3 files changed, 36 insertions(+) create mode 100644 tools/perf/util/rlimit.c create mode 100644 tools/perf/util/rlimit.h (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index d7e3b008a613..14f812bb07a7 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -20,6 +20,7 @@ perf-y += parse-events.o perf-y += perf_regs.o perf-y += path.o perf-y += print_binary.o +perf-y += rlimit.o perf-y += argv_split.o perf-y += rbtree.o perf-y += libstring.o diff --git a/tools/perf/util/rlimit.c b/tools/perf/util/rlimit.c new file mode 100644 index 000000000000..13521d392a22 --- /dev/null +++ b/tools/perf/util/rlimit.c @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ + +#include "util/debug.h" +#include "util/rlimit.h" +#include +#include + +/* + * Bump the memlock so that we can get bpf maps of a reasonable size, + * like the ones used with 'perf trace' and with 'perf test bpf', + * improve this to some specific request if needed. + */ +void rlimit__bump_memlock(void) +{ + struct rlimit rlim; + + if (getrlimit(RLIMIT_MEMLOCK, &rlim) == 0) { + rlim.rlim_cur *= 4; + rlim.rlim_max *= 4; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + rlim.rlim_cur /= 2; + rlim.rlim_max /= 2; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) + pr_debug("Couldn't bump rlimit(MEMLOCK), failures may take place when creating BPF maps, etc\n"); + } + } +} diff --git a/tools/perf/util/rlimit.h b/tools/perf/util/rlimit.h new file mode 100644 index 000000000000..9f59d8e710a3 --- /dev/null +++ b/tools/perf/util/rlimit.h @@ -0,0 +1,6 @@ +#ifndef __PERF_RLIMIT_H_ +#define __PERF_RLIMIT_H_ +/* SPDX-License-Identifier: LGPL-2.1 */ + +void rlimit__bump_memlock(void); +#endif // __PERF_RLIMIT_H_ -- cgit v1.2.3 From d3280ce01e21a827daa50b0e9bdc8588c6f855d8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Jul 2019 16:27:01 -0300 Subject: perf test: Auto bump rlimit(MEMLOCK) for BPF test sake I noticed that the 'perf test bpf' was failing: # perf test bpf 41: BPF filter : 41.1: Basic BPF filtering : Skip 41.2: BPF pinning : Skip 41.3: BPF prologue generation : Skip 41.4: BPF relocation checker : Skip # ulimit -l 64 # Using verbose mode we get just a line bout -EPERF being returned from libbpf's bpf_load_program_xattr(), that ends up being used in 'perf test bpf' initial program loading capability query: Missing basic BPF support, skip this test: Operation not permitted Not that informative, but on a separate problem when creating BPF maps bumping rlimit(MEMLOCK) helped, so I tried it here as well, works: # ulimit -l 128 # perf test bpf 41: BPF filter : 41.1: Basic BPF filtering : Ok 41.2: BPF pinning : Ok 41.3: BPF prologue generation : Ok 41.4: BPF relocation checker : Ok # So use the recently added rlimit__bump_memlock() helper: # ulimit -l 64 # perf test bpf 41: BPF filter : 41.1: Basic BPF filtering : Ok 41.2: BPF pinning : Ok 41.3: BPF prologue generation : Ok 41.4: BPF relocation checker : Ok # ulimit -l 64 # I.e. the bumping of memlock is restricted to the 'perf test' instance, not changing the global value. Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-b9fubkhr4jm192lu7y8hgjvo@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 66a82badc1d1..c3bec9d2c201 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -21,6 +21,7 @@ #include #include "string2.h" #include "symbol.h" +#include "util/rlimit.h" #include #include #include @@ -727,6 +728,11 @@ int cmd_test(int argc, const char **argv) if (skip != NULL) skiplist = intlist__new(skip); + /* + * Tests that create BPF maps, for instance, need more than the 64K + * default: + */ + rlimit__bump_memlock(); return __cmd_test(argc, argv, skiplist); } -- cgit v1.2.3 From c3e78a3403dabcb7115c2fb7b538a1095d168cd5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Jul 2019 16:36:45 -0300 Subject: perf trace: Auto bump rlimit(MEMLOCK) for eBPF maps sake Circa v5.2 this started to fail: # perf trace -e /wb/augmented_raw_syscalls.o event syntax error: '/wb/augmented_raw_syscalls.o' \___ Operation not permitted (add -v to see detail) Run 'perf list' for a list of valid events Usage: perf trace [] [] or: perf trace [] -- [] or: perf trace record [] [] or: perf trace record [] -- [] -e, --event event/syscall selector. use 'perf list' to list available events # In verbose mode we some -EPERM when creating a BPF map: # perf trace -v -e /wb/augmented_raw_syscalls.o libbpf: failed to create map (name: '__augmented_syscalls__'): Operation not permitted libbpf: failed to load object '/wb/augmented_raw_syscalls.o' bpf: load objects failed: err=-1: (Operation not permitted) event syntax error: '/wb/augmented_raw_syscalls.o' \___ Operation not permitted (add -v to see detail) Run 'perf list' for a list of valid events Usage: perf trace [] [] or: perf trace [] -- [] or: perf trace record [] [] or: perf trace record [] -- [] -e, --event event/syscall selector. use 'perf list' to list available events # If we bumped 'ulimit -l 128' to get it from the 64k default to double that, it worked, so use the recently added rlimit__bump_memlock() helper: # perf trace -e /wb/augmented_raw_syscalls.o -e open*,*sleep sleep 1 0.000 ( 0.007 ms): sleep/28042 openat(dfd: CWD, filename: "/etc/ld.so.cache", flags: RDONLY|CLOEXEC) = 3 0.022 ( 0.004 ms): sleep/28042 openat(dfd: CWD, filename: "/lib64/libc.so.6", flags: RDONLY|CLOEXEC) = 3 0.201 ( 0.007 ms): sleep/28042 openat(dfd: CWD, filename: "", flags: RDONLY|CLOEXEC) = 3 0.241 (1000.421 ms): sleep/28042 nanosleep(rqtp: 0x7ffd6c3e6ed0) = 0 # Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-j6f2ioa6hj9dinzpjvlhcjoc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 1aa2ed096f65..4f0bbffee05f 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -19,6 +19,7 @@ #include #include #include "util/bpf_map.h" +#include "util/rlimit.h" #include "builtin.h" #include "util/cgroup.h" #include "util/color.h" @@ -3864,6 +3865,15 @@ int cmd_trace(int argc, const char **argv) goto out; } + /* + * Parsing .perfconfig may entail creating a BPF event, that may need + * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting + * is too small. This affects just this process, not touching the + * global setting. If it fails we'll get something in 'perf trace -v' + * to help diagnose the problem. + */ + rlimit__bump_memlock(); + err = perf_config(trace__config, &trace); if (err) goto out; -- cgit v1.2.3 From fead24e52383c3f8eb25b5426d52b430b84a8194 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:50 +0300 Subject: perf db-export: Get rid of db_export__deferred() db_export__deferred() deferred the export of comms if the comm string had not been "set" (changed from :) however that problem was fixed a long time ago by commit e803cf97a4f9 ("perf record: Synthesize COMM event for a command line workload"), so get rid of db_export__deferred(). Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20190710085810.1650-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 61 +--------------------- tools/perf/util/db-export.h | 2 - .../util/scripting-engines/trace-event-python.c | 4 +- 3 files changed, 2 insertions(+), 65 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 2394c7506abe..34cf197fe74f 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -20,70 +20,14 @@ #include "db-export.h" #include -struct deferred_export { - struct list_head node; - struct comm *comm; -}; - -static int db_export__deferred(struct db_export *dbe) -{ - struct deferred_export *de; - int err; - - while (!list_empty(&dbe->deferred)) { - de = list_entry(dbe->deferred.next, struct deferred_export, - node); - err = dbe->export_comm(dbe, de->comm); - list_del_init(&de->node); - free(de); - if (err) - return err; - } - - return 0; -} - -static void db_export__free_deferred(struct db_export *dbe) -{ - struct deferred_export *de; - - while (!list_empty(&dbe->deferred)) { - de = list_entry(dbe->deferred.next, struct deferred_export, - node); - list_del_init(&de->node); - free(de); - } -} - -static int db_export__defer_comm(struct db_export *dbe, struct comm *comm) -{ - struct deferred_export *de; - - de = zalloc(sizeof(struct deferred_export)); - if (!de) - return -ENOMEM; - - de->comm = comm; - list_add_tail(&de->node, &dbe->deferred); - - return 0; -} - int db_export__init(struct db_export *dbe) { memset(dbe, 0, sizeof(struct db_export)); - INIT_LIST_HEAD(&dbe->deferred); return 0; } -int db_export__flush(struct db_export *dbe) -{ - return db_export__deferred(dbe); -} - void db_export__exit(struct db_export *dbe) { - db_export__free_deferred(dbe); call_return_processor__free(dbe->crp); dbe->crp = NULL; } @@ -172,10 +116,7 @@ int db_export__comm(struct db_export *dbe, struct comm *comm, comm->db_id = ++dbe->comm_last_db_id; if (dbe->export_comm) { - if (main_thread->comm_set) - err = dbe->export_comm(dbe, comm); - else - err = db_export__defer_comm(dbe, comm); + err = dbe->export_comm(dbe, comm); if (err) return err; } diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index e8a64028a386..261cfece8dee 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -68,11 +68,9 @@ struct db_export { u64 sample_last_db_id; u64 call_path_last_db_id; u64 call_return_last_db_id; - struct list_head deferred; }; int db_export__init(struct db_export *dbe); -int db_export__flush(struct db_export *dbe); void db_export__exit(struct db_export *dbe); int db_export__evsel(struct db_export *dbe, struct perf_evsel *evsel); int db_export__machine(struct db_export *dbe, struct machine *machine); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 112bed65232f..c9837f0f0fd6 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1620,9 +1620,7 @@ error: static int python_flush_script(void) { - struct tables *tables = &tables_global; - - return db_export__flush(&tables->dbe); + return 0; } /* -- cgit v1.2.3 From 208032fef13b68cf1eefc945dafb82efc88c6b8f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:51 +0300 Subject: perf db-export: Rename db_export__comm() to db_export__exec_comm() Rename db_export__comm() to db_export__exec_comm() to better reflect what it does and add explanatory comments. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 22 +++++++++++++++++++--- tools/perf/util/db-export.h | 4 ++-- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 34cf197fe74f..8fab57f90cbc 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -105,8 +105,14 @@ out_put: return err; } -int db_export__comm(struct db_export *dbe, struct comm *comm, - struct thread *main_thread) +/* + * Export the "exec" comm. The "exec" comm is the program / application command + * name at the time it first executes. It is used to group threads for the same + * program. Note that the main thread pid (or thread group id tgid) cannot be + * used because it does not change when a new program is exec'ed. + */ +int db_export__exec_comm(struct db_export *dbe, struct comm *comm, + struct thread *main_thread) { int err; @@ -121,6 +127,16 @@ int db_export__comm(struct db_export *dbe, struct comm *comm, return err; } + /* + * Record the main thread for this comm. Note that the main thread can + * have many "exec" comms because there will be a new one every time it + * exec's. An "exec" comm however will only ever have 1 main thread. + * That is different to any other threads for that same program because + * exec() will effectively kill them, so the relationship between the + * "exec" comm and non-main threads is 1-to-1. That is why + * db_export__comm_thread() is called here for the main thread, but it + * is called for non-main threads when they are exported. + */ return db_export__comm_thread(dbe, comm, main_thread); } @@ -313,7 +329,7 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, goto out_put; if (comm) { - err = db_export__comm(dbe, comm, main_thread); + err = db_export__exec_comm(dbe, comm, main_thread); if (err) goto out_put; es.comm_db_id = comm->db_id; diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 261cfece8dee..148a657b1887 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -76,8 +76,8 @@ int db_export__evsel(struct db_export *dbe, struct perf_evsel *evsel); int db_export__machine(struct db_export *dbe, struct machine *machine); int db_export__thread(struct db_export *dbe, struct thread *thread, struct machine *machine, struct comm *comm); -int db_export__comm(struct db_export *dbe, struct comm *comm, - struct thread *main_thread); +int db_export__exec_comm(struct db_export *dbe, struct comm *comm, + struct thread *main_thread); int db_export__comm_thread(struct db_export *dbe, struct comm *comm, struct thread *thread); int db_export__dso(struct db_export *dbe, struct dso *dso, -- cgit v1.2.3 From ed5c0a16feb9f1a4347f109d5e9607f6f38688a0 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:52 +0300 Subject: perf db-export: Pass main_thread to db_export__thread() Calls to db_export__thread() already have main_thread so there is no reason to get it again, instead pass it as a parameter. Note that one difference in this approach is that the main thread is not created if it does not exist. It is better if it is not created because: - If main_thread is being traced it will have been created already. - If it is not being traced, there will be no other information about it, and it will never get deleted because there will be no EXIT event. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 29 ++++++++--------------------- tools/perf/util/db-export.h | 3 ++- 2 files changed, 10 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 8fab57f90cbc..14501236c046 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -59,9 +59,9 @@ int db_export__machine(struct db_export *dbe, struct machine *machine) } int db_export__thread(struct db_export *dbe, struct thread *thread, - struct machine *machine, struct comm *comm) + struct machine *machine, struct comm *comm, + struct thread *main_thread) { - struct thread *main_thread; u64 main_thread_db_id = 0; int err; @@ -70,28 +70,19 @@ int db_export__thread(struct db_export *dbe, struct thread *thread, thread->db_id = ++dbe->thread_last_db_id; - if (thread->pid_ != -1) { - if (thread->pid_ == thread->tid) { - main_thread = thread; - } else { - main_thread = machine__findnew_thread(machine, - thread->pid_, - thread->pid_); - if (!main_thread) - return -ENOMEM; + if (main_thread) { + if (main_thread != thread) { err = db_export__thread(dbe, main_thread, machine, - comm); + comm, main_thread); if (err) - goto out_put; + return err; if (comm) { err = db_export__comm_thread(dbe, comm, thread); if (err) - goto out_put; + return err; } } main_thread_db_id = main_thread->db_id; - if (main_thread != thread) - thread__put(main_thread); } if (dbe->export_thread) @@ -99,10 +90,6 @@ int db_export__thread(struct db_export *dbe, struct thread *thread, machine); return 0; - -out_put: - thread__put(main_thread); - return err; } /* @@ -324,7 +311,7 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, if (main_thread) comm = machine__thread_exec_comm(al->machine, main_thread); - err = db_export__thread(dbe, thread, al->machine, comm); + err = db_export__thread(dbe, thread, al->machine, comm, main_thread); if (err) goto out_put; diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 148a657b1887..6e267321594c 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -75,7 +75,8 @@ void db_export__exit(struct db_export *dbe); int db_export__evsel(struct db_export *dbe, struct perf_evsel *evsel); int db_export__machine(struct db_export *dbe, struct machine *machine); int db_export__thread(struct db_export *dbe, struct thread *thread, - struct machine *machine, struct comm *comm); + struct machine *machine, struct comm *comm, + struct thread *main_thread); int db_export__exec_comm(struct db_export *dbe, struct comm *comm, struct thread *main_thread); int db_export__comm_thread(struct db_export *dbe, struct comm *comm, -- cgit v1.2.3 From 19207d86940db9dad5f2e0a270a2490f7da451e3 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:53 +0300 Subject: perf db-export: Export main_thread in db_export__sample() Export main_thread in db_export__sample() because it makes the code easier to understand, and prepares db_export__thread() for further simplification. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 14501236c046..63f9edf65eee 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -71,16 +71,10 @@ int db_export__thread(struct db_export *dbe, struct thread *thread, thread->db_id = ++dbe->thread_last_db_id; if (main_thread) { - if (main_thread != thread) { - err = db_export__thread(dbe, main_thread, machine, - comm, main_thread); + if (main_thread != thread && comm) { + err = db_export__comm_thread(dbe, comm, thread); if (err) return err; - if (comm) { - err = db_export__comm_thread(dbe, comm, thread); - if (err) - return err; - } } main_thread_db_id = main_thread->db_id; } @@ -308,12 +302,24 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, return err; main_thread = thread__main_thread(al->machine, thread); - if (main_thread) + if (main_thread) { comm = machine__thread_exec_comm(al->machine, main_thread); + /* + * A thread has a reference to the main thread, so export the + * main thread first. + */ + err = db_export__thread(dbe, main_thread, al->machine, comm, + main_thread); + if (err) + goto out_put; + } - err = db_export__thread(dbe, thread, al->machine, comm, main_thread); - if (err) - goto out_put; + if (thread != main_thread) { + err = db_export__thread(dbe, thread, al->machine, comm, + main_thread); + if (err) + goto out_put; + } if (comm) { err = db_export__exec_comm(dbe, comm, main_thread); -- cgit v1.2.3 From 6319790bcf825bcc4cd9bf01f01ae404a2fb7da8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:54 +0300 Subject: perf db-export: Export comm before exporting thread Export comm before exporting the non-main thread because db_export__thread() also exports the comm_thread. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 63f9edf65eee..99ad759561de 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -312,6 +312,12 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, main_thread); if (err) goto out_put; + if (comm) { + err = db_export__exec_comm(dbe, comm, main_thread); + if (err) + goto out_put; + es.comm_db_id = comm->db_id; + } } if (thread != main_thread) { @@ -321,13 +327,6 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, goto out_put; } - if (comm) { - err = db_export__exec_comm(dbe, comm, main_thread); - if (err) - goto out_put; - es.comm_db_id = comm->db_id; - } - es.db_id = ++dbe->sample_last_db_id; err = db_ids_from_al(dbe, al, &es.dso_db_id, &es.sym_db_id, &es.offset); -- cgit v1.2.3 From 1ed119589834e25c130fdaa911ca8b0e3fd1cddf Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:55 +0300 Subject: perf db-export: Move export__comm_thread into db_export__sample() Move call to db_export__comm_thread() from db_export__thread() into db_export__sample() because it makes the code easier to understand, and add explanatory comments. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 35 +++++++++++++++++++++-------------- tools/perf/util/db-export.h | 3 +-- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 99ad759561de..78f62a733b9d 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -59,25 +59,17 @@ int db_export__machine(struct db_export *dbe, struct machine *machine) } int db_export__thread(struct db_export *dbe, struct thread *thread, - struct machine *machine, struct comm *comm, - struct thread *main_thread) + struct machine *machine, struct thread *main_thread) { u64 main_thread_db_id = 0; - int err; if (thread->db_id) return 0; thread->db_id = ++dbe->thread_last_db_id; - if (main_thread) { - if (main_thread != thread && comm) { - err = db_export__comm_thread(dbe, comm, thread); - if (err) - return err; - } + if (main_thread) main_thread_db_id = main_thread->db_id; - } if (dbe->export_thread) return dbe->export_thread(dbe, thread, main_thread_db_id, @@ -303,15 +295,19 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, main_thread = thread__main_thread(al->machine, thread); if (main_thread) { - comm = machine__thread_exec_comm(al->machine, main_thread); /* * A thread has a reference to the main thread, so export the * main thread first. */ - err = db_export__thread(dbe, main_thread, al->machine, comm, + err = db_export__thread(dbe, main_thread, al->machine, main_thread); if (err) goto out_put; + /* + * Export comm before exporting the non-main thread because + * db_export__comm_thread() can be called further below. + */ + comm = machine__thread_exec_comm(al->machine, main_thread); if (comm) { err = db_export__exec_comm(dbe, comm, main_thread); if (err) @@ -321,10 +317,21 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, } if (thread != main_thread) { - err = db_export__thread(dbe, thread, al->machine, comm, - main_thread); + /* + * For a non-main thread, db_export__comm_thread() must be + * called only if thread has not previously been exported. + */ + bool export_comm_thread = comm && !thread->db_id; + + err = db_export__thread(dbe, thread, al->machine, main_thread); if (err) goto out_put; + + if (export_comm_thread) { + err = db_export__comm_thread(dbe, comm, thread); + if (err) + goto out_put; + } } es.db_id = ++dbe->sample_last_db_id; diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 6e267321594c..811a678a910d 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -75,8 +75,7 @@ void db_export__exit(struct db_export *dbe); int db_export__evsel(struct db_export *dbe, struct perf_evsel *evsel); int db_export__machine(struct db_export *dbe, struct machine *machine); int db_export__thread(struct db_export *dbe, struct thread *thread, - struct machine *machine, struct comm *comm, - struct thread *main_thread); + struct machine *machine, struct thread *main_thread); int db_export__exec_comm(struct db_export *dbe, struct comm *comm, struct thread *main_thread); int db_export__comm_thread(struct db_export *dbe, struct comm *comm, -- cgit v1.2.3 From a5defb2f3984e0f056e4113b54c461782796c7be Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:56 +0300 Subject: perf db-export: Fix a white space issue in db_export__sample() Fix a white space issue in db_export__sample() Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-8-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 78f62a733b9d..2c3a4ad68428 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -274,7 +274,7 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al) { - struct thread* thread = al->thread; + struct thread *thread = al->thread; struct export_sample es = { .event = event, .sample = sample, -- cgit v1.2.3 From 8ebf5cc0f6ce469d65ba2e8ce519dae34f0b3f50 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:57 +0300 Subject: perf db-export: Export comm details In preparation for exporting the current comm for a thread, export comm thread id, start time and exec flag. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-9-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 2 +- tools/perf/util/db-export.h | 3 ++- tools/perf/util/scripting-engines/trace-event-python.c | 8 ++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 2c3a4ad68428..b0504d3eb130 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -95,7 +95,7 @@ int db_export__exec_comm(struct db_export *dbe, struct comm *comm, comm->db_id = ++dbe->comm_last_db_id; if (dbe->export_comm) { - err = dbe->export_comm(dbe, comm); + err = dbe->export_comm(dbe, comm, main_thread); if (err) return err; } diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 811a678a910d..29f7c3b035a7 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -43,7 +43,8 @@ struct db_export { int (*export_machine)(struct db_export *dbe, struct machine *machine); int (*export_thread)(struct db_export *dbe, struct thread *thread, u64 main_thread_db_id, struct machine *machine); - int (*export_comm)(struct db_export *dbe, struct comm *comm); + int (*export_comm)(struct db_export *dbe, struct comm *comm, + struct thread *thread); int (*export_comm_thread)(struct db_export *dbe, u64 db_id, struct comm *comm, struct thread *thread); int (*export_dso)(struct db_export *dbe, struct dso *dso, diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c9837f0f0fd6..28167e938cef 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1011,15 +1011,19 @@ static int python_export_thread(struct db_export *dbe, struct thread *thread, return 0; } -static int python_export_comm(struct db_export *dbe, struct comm *comm) +static int python_export_comm(struct db_export *dbe, struct comm *comm, + struct thread *thread) { struct tables *tables = container_of(dbe, struct tables, dbe); PyObject *t; - t = tuple_new(2); + t = tuple_new(5); tuple_set_u64(t, 0, comm->db_id); tuple_set_string(t, 1, comm__str(comm)); + tuple_set_u64(t, 2, thread->db_id); + tuple_set_u64(t, 3, comm->start); + tuple_set_s32(t, 4, comm->exec); call_object(tables->comm_handler, t, "comm_table"); -- cgit v1.2.3 From 41085f2bdd5882632e7dd88d1e5b59b7eac2a2a9 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:58 +0300 Subject: perf scripts python: export-to-sqlite.py: Export comm details Add table columns for thread id, comm start time and exec flag. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-10-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-sqlite.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index 021326c46285..97aa66dd2fe1 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -177,7 +177,10 @@ do_query(query, 'CREATE TABLE threads (' 'tid integer)') do_query(query, 'CREATE TABLE comms (' 'id integer NOT NULL PRIMARY KEY,' - 'comm varchar(16))') + 'comm varchar(16),' + 'c_thread_id bigint,' + 'c_time bigint,' + 'exec_flag boolean)') do_query(query, 'CREATE TABLE comm_threads (' 'id integer NOT NULL PRIMARY KEY,' 'comm_id bigint,' @@ -536,7 +539,7 @@ machine_query.prepare("INSERT INTO machines VALUES (?, ?, ?)") thread_query = QSqlQuery(db) thread_query.prepare("INSERT INTO threads VALUES (?, ?, ?, ?, ?)") comm_query = QSqlQuery(db) -comm_query.prepare("INSERT INTO comms VALUES (?, ?)") +comm_query.prepare("INSERT INTO comms VALUES (?, ?, ?, ?, ?)") comm_thread_query = QSqlQuery(db) comm_thread_query.prepare("INSERT INTO comm_threads VALUES (?, ?, ?)") dso_query = QSqlQuery(db) @@ -576,7 +579,7 @@ def trace_begin(): evsel_table(0, "unknown") machine_table(0, 0, "unknown") thread_table(0, 0, 0, -1, -1) - comm_table(0, "unknown") + comm_table(0, "unknown", 0, 0, 0) dso_table(0, 0, "unknown", "unknown", "") symbol_table(0, 0, 0, 0, 0, "unknown") sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) @@ -642,7 +645,7 @@ def thread_table(*x): bind_exec(thread_query, 5, x) def comm_table(*x): - bind_exec(comm_query, 2, x) + bind_exec(comm_query, 5, x) def comm_thread_table(*x): bind_exec(comm_thread_query, 3, x) -- cgit v1.2.3 From 8534b5de81802a82b13fe05acc3e749e3baf980e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:57:59 +0300 Subject: perf scripts python: export-to-postgresql.py: Export comm details Add table columns for thread id, comm start time and exec flag. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-11-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-postgresql.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 92713d93e956..01f37877f5bb 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -353,7 +353,10 @@ do_query(query, 'CREATE TABLE threads (' 'tid integer)') do_query(query, 'CREATE TABLE comms (' 'id bigint NOT NULL,' - 'comm varchar(16))') + 'comm varchar(16),' + 'c_thread_id bigint,' + 'c_time bigint,' + 'exec_flag boolean)') do_query(query, 'CREATE TABLE comm_threads (' 'id bigint NOT NULL,' 'comm_id bigint,' @@ -763,7 +766,7 @@ def trace_begin(): evsel_table(0, "unknown") machine_table(0, 0, "unknown") thread_table(0, 0, 0, -1, -1) - comm_table(0, "unknown") + comm_table(0, "unknown", 0, 0, 0) dso_table(0, 0, "unknown", "unknown", "") symbol_table(0, 0, 0, 0, 0, "unknown") sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) @@ -851,6 +854,8 @@ def trace_end(): do_query(query, 'ALTER TABLE threads ' 'ADD CONSTRAINT machinefk FOREIGN KEY (machine_id) REFERENCES machines (id),' 'ADD CONSTRAINT processfk FOREIGN KEY (process_id) REFERENCES threads (id)') + do_query(query, 'ALTER TABLE comms ' + 'ADD CONSTRAINT threadfk FOREIGN KEY (c_thread_id) REFERENCES threads (id)') do_query(query, 'ALTER TABLE comm_threads ' 'ADD CONSTRAINT commfk FOREIGN KEY (comm_id) REFERENCES comms (id),' 'ADD CONSTRAINT threadfk FOREIGN KEY (thread_id) REFERENCES threads (id)') @@ -935,11 +940,11 @@ def thread_table(thread_id, machine_id, process_id, pid, tid, *x): value = struct.pack("!hiqiqiqiiii", 5, 8, thread_id, 8, machine_id, 8, process_id, 4, pid, 4, tid) thread_file.write(value) -def comm_table(comm_id, comm_str, *x): +def comm_table(comm_id, comm_str, thread_id, time, exec_flag, *x): comm_str = toserverstr(comm_str) n = len(comm_str) - fmt = "!hiqi" + str(n) + "s" - value = struct.pack(fmt, 2, 8, comm_id, n, comm_str) + fmt = "!hiqi" + str(n) + "s" + "iqiqiB" + value = struct.pack(fmt, 5, 8, comm_id, n, comm_str, 8, thread_id, 8, time, 1, exec_flag) comm_file.write(value) def comm_thread_table(comm_thread_id, comm_id, thread_id, *x): -- cgit v1.2.3 From 80859c947a1eb170927d03e713abf7550a3d8766 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:00 +0300 Subject: perf db-export: Factor out db_export__comm() In preparation for exporting the current comm for a thread, factor out db_export__comm(). Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-12-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 30 +++++++++++++++++++++++------- tools/perf/util/db-export.h | 2 ++ 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index b0504d3eb130..b1e581c13963 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -78,6 +78,26 @@ int db_export__thread(struct db_export *dbe, struct thread *thread, return 0; } +static int __db_export__comm(struct db_export *dbe, struct comm *comm, + struct thread *thread) +{ + comm->db_id = ++dbe->comm_last_db_id; + + if (dbe->export_comm) + return dbe->export_comm(dbe, comm, thread); + + return 0; +} + +int db_export__comm(struct db_export *dbe, struct comm *comm, + struct thread *thread) +{ + if (comm->db_id) + return 0; + + return __db_export__comm(dbe, comm, thread); +} + /* * Export the "exec" comm. The "exec" comm is the program / application command * name at the time it first executes. It is used to group threads for the same @@ -92,13 +112,9 @@ int db_export__exec_comm(struct db_export *dbe, struct comm *comm, if (comm->db_id) return 0; - comm->db_id = ++dbe->comm_last_db_id; - - if (dbe->export_comm) { - err = dbe->export_comm(dbe, comm, main_thread); - if (err) - return err; - } + err = __db_export__comm(dbe, comm, main_thread); + if (err) + return err; /* * Record the main thread for this comm. Note that the main thread can diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 29f7c3b035a7..f5f0865f07e1 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -77,6 +77,8 @@ int db_export__evsel(struct db_export *dbe, struct perf_evsel *evsel); int db_export__machine(struct db_export *dbe, struct machine *machine); int db_export__thread(struct db_export *dbe, struct thread *thread, struct machine *machine, struct thread *main_thread); +int db_export__comm(struct db_export *dbe, struct comm *comm, + struct thread *thread); int db_export__exec_comm(struct db_export *dbe, struct comm *comm, struct thread *main_thread); int db_export__comm_thread(struct db_export *dbe, struct comm *comm, -- cgit v1.2.3 From 4650c7bed79582c74452d284e45d5b76987c0ef3 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:01 +0300 Subject: perf db-export: Also export thread's current comm Currently, the initial comm of the main thread is exported. Export also a thread's current comm. That better supports the tracing of multi-threaded applications that set different comms for different threads to make it easier to distinguish them. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-13-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index b1e581c13963..5057fdd7f62d 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -299,6 +299,7 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, }; struct thread *main_thread; struct comm *comm = NULL; + struct comm *curr_comm; int err; err = db_export__evsel(dbe, evsel); @@ -350,6 +351,13 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, } } + curr_comm = thread__comm(thread); + if (curr_comm) { + err = db_export__comm(dbe, curr_comm, thread); + if (err) + goto out_put; + } + es.db_id = ++dbe->sample_last_db_id; err = db_ids_from_al(dbe, al, &es.dso_db_id, &es.sym_db_id, &es.offset); -- cgit v1.2.3 From ecc8c9984dae9812a10936cb9c74957b68075e07 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:02 +0300 Subject: perf scripts python: export-to-sqlite.py: Add has_calls column to comms table Now that a thread's current comm is exported, it shows up in the call graph and call tree even if it has no calls. That can happen because the calls are recorded against the main thread's initial comm. Add a table column to make it easy for the exported-sql-viewer.py script to select only comms with calls. Committer notes: Running the export-to-sqlite.py worked without warnings and using the exported-sql-viewer.py worked as before. Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-14-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-sqlite.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index 97aa66dd2fe1..9156f6a1e5f0 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -606,6 +606,8 @@ def trace_end(): if perf_db_export_calls: do_query(query, 'CREATE INDEX pcpid_idx ON calls (parent_call_path_id)') do_query(query, 'CREATE INDEX pid_idx ON calls (parent_id)') + do_query(query, 'ALTER TABLE comms ADD has_calls boolean') + do_query(query, 'UPDATE comms SET has_calls = 1 WHERE comms.id IN (SELECT DISTINCT comm_id FROM calls)') printdate("Dropping unused tables") if is_table_empty("ptwrite"): -- cgit v1.2.3 From d9efc1d25214da500d6592095b542b32c15459df Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:03 +0300 Subject: perf scripts python: export-to-postgresql.py: Add has_calls column to comms table Now that a thread's current comm is exported, it shows up in the call graph and call tree even if it has no calls. That can happen because the calls are recorded against the main thread's initial comm. Add a table column to make it easy for the exported-sql-viewer.py script to select only comms with calls. Committer testing: $ rm -f simple-retpoline.db $ sudo ~acme/bin/perf script -i simple-retpoline.perf.data --itrace=be -s ~/libexec/perf-core/scripts/python/export-to-sqlite.py simple-retpoline.db branches calls 2019-07-10 12:25:33.200529 Creating database ... 2019-07-10 12:25:33.211548 Writing records... 2019-07-10 12:25:33.549630 Adding indexes 2019-07-10 12:25:33.560715 Dropping unused tables 2019-07-10 12:25:33.580201 Done $ sha256sum tools/perf/scripts/python/export-to-sqlite.py ~/libexec/perf-core/scripts/python/export-to-sqlite.py 2922b642c392004dffa1d8789296478c85904623f5895bcb9b6cbf33e3ca999f tools/perf/scripts/python/export-to-sqlite.py 2922b642c392004dffa1d8789296478c85904623f5895bcb9b6cbf33e3ca999f /home/acme/libexec/perf-core/scripts/python/export-to-sqlite.py $ $ sqlite3 simple-retpoline.db SQLite version 3.26.0 2018-12-01 12:34:55 Enter ".help" for usage hints. sqlite> .schema comms CREATE TABLE comms (id integer NOT NULL PRIMARY KEY,comm varchar(16),c_thread_id bigint,c_time bigint,exec_flag boolean, has_calls boolean); sqlite> select id,has_calls from comms; 0|1 1|1 sqlite> select distinct comm_id from calls; 0 1 sqlite> Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-15-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-postgresql.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 01f37877f5bb..13205e4e5b3b 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -886,6 +886,8 @@ def trace_end(): 'ADD CONSTRAINT parent_call_pathfk FOREIGN KEY (parent_call_path_id) REFERENCES call_paths (id)') do_query(query, 'CREATE INDEX pcpid_idx ON calls (parent_call_path_id)') do_query(query, 'CREATE INDEX pid_idx ON calls (parent_id)') + do_query(query, 'ALTER TABLE comms ADD has_calls boolean') + do_query(query, 'UPDATE comms SET has_calls = TRUE WHERE comms.id IN (SELECT DISTINCT comm_id FROM calls)') do_query(query, 'ALTER TABLE ptwrite ' 'ADD CONSTRAINT idfk FOREIGN KEY (id) REFERENCES samples (id)') do_query(query, 'ALTER TABLE cbr ' -- cgit v1.2.3 From 266887291cac7f4020b5c83d2af9a13aece44a74 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:04 +0300 Subject: perf scripts python: exported-sql-viewer.py: Remove redundant semi-colons Remove redundant semi-colons added inadvertently. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-16-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/exported-sql-viewer.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index 6e7934f2ac9a..dbbd7a5d9b60 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -392,7 +392,7 @@ class FindBar(): self.hbox.addWidget(self.close_button) self.bar = QWidget() - self.bar.setLayout(self.hbox); + self.bar.setLayout(self.hbox) self.bar.hide() def Widget(self): @@ -470,7 +470,7 @@ class CallGraphLevelItemBase(object): self.params = params self.row = row self.parent_item = parent_item - self.query_done = False; + self.query_done = False self.child_count = 0 self.child_items = [] if parent_item: @@ -517,7 +517,7 @@ class CallGraphLevelTwoPlusItemBase(CallGraphLevelItemBase): self.time = time def Select(self): - self.query_done = True; + self.query_done = True query = QSqlQuery(self.glb.db) if self.params.have_ipc: ipc_str = ", SUM(insn_count), SUM(cyc_count)" @@ -604,7 +604,7 @@ class CallGraphLevelOneItem(CallGraphLevelItemBase): self.dbid = comm_id def Select(self): - self.query_done = True; + self.query_done = True query = QSqlQuery(self.glb.db) QueryExec(query, "SELECT thread_id, pid, tid" " FROM comm_threads" @@ -622,7 +622,7 @@ class CallGraphRootItem(CallGraphLevelItemBase): def __init__(self, glb, params): super(CallGraphRootItem, self).__init__(glb, params, 0, None) self.dbid = 0 - self.query_done = True; + self.query_done = True query = QSqlQuery(glb.db) QueryExec(query, "SELECT id, comm FROM comms") while query.next(): @@ -793,7 +793,7 @@ class CallTreeLevelTwoPlusItemBase(CallGraphLevelItemBase): self.time = time def Select(self): - self.query_done = True; + self.query_done = True if self.calls_id == 0: comm_thread = " AND comm_id = " + str(self.comm_id) + " AND thread_id = " + str(self.thread_id) else: @@ -881,7 +881,7 @@ class CallTreeLevelOneItem(CallGraphLevelItemBase): self.dbid = comm_id def Select(self): - self.query_done = True; + self.query_done = True query = QSqlQuery(self.glb.db) QueryExec(query, "SELECT thread_id, pid, tid" " FROM comm_threads" @@ -899,7 +899,7 @@ class CallTreeRootItem(CallGraphLevelItemBase): def __init__(self, glb, params): super(CallTreeRootItem, self).__init__(glb, params, 0, None) self.dbid = 0 - self.query_done = True; + self.query_done = True query = QSqlQuery(glb.db) QueryExec(query, "SELECT id, comm FROM comms") while query.next(): @@ -971,7 +971,7 @@ class VBox(): def __init__(self, w1, w2, w3=None): self.vbox = QWidget() - self.vbox.setLayout(QVBoxLayout()); + self.vbox.setLayout(QVBoxLayout()) self.vbox.layout().setContentsMargins(0, 0, 0, 0) @@ -1391,7 +1391,7 @@ class FetchMoreRecordsBar(): self.hbox.addWidget(self.close_button) self.bar = QWidget() - self.bar.setLayout(self.hbox); + self.bar.setLayout(self.hbox) self.bar.show() self.in_progress = False @@ -2206,7 +2206,7 @@ class ReportDialogBase(QDialog): self.vbox.addLayout(self.grid) self.vbox.addLayout(self.hbox) - self.setLayout(self.vbox); + self.setLayout(self.vbox) def Ok(self): vars = self.report_vars @@ -3139,7 +3139,7 @@ class AboutDialog(QDialog): self.vbox = QVBoxLayout() self.vbox.addWidget(self.text) - self.setLayout(self.vbox); + self.setLayout(self.vbox) # Font resize -- cgit v1.2.3 From 26c11206f433ea507a7541f48cb472b85870577e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:05 +0300 Subject: perf scripts python: exported-sql-viewer.py: Use new 'has_calls' column If the new 'has_calls' column is present, use it with the call graph and call tree to select only comms that have calls. Committer testing: Just started the exported-sql-view.py and accessed all the reports, no backtraces. Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-17-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/exported-sql-viewer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index dbbd7a5d9b60..61b3911d91e6 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -623,8 +623,11 @@ class CallGraphRootItem(CallGraphLevelItemBase): super(CallGraphRootItem, self).__init__(glb, params, 0, None) self.dbid = 0 self.query_done = True + if_has_calls = "" + if IsSelectable(glb.db, "comms", columns = "has_calls"): + if_has_calls = " WHERE has_calls = TRUE" query = QSqlQuery(glb.db) - QueryExec(query, "SELECT id, comm FROM comms") + QueryExec(query, "SELECT id, comm FROM comms" + if_has_calls) while query.next(): if not query.value(0): continue @@ -900,8 +903,11 @@ class CallTreeRootItem(CallGraphLevelItemBase): super(CallTreeRootItem, self).__init__(glb, params, 0, None) self.dbid = 0 self.query_done = True + if_has_calls = "" + if IsSelectable(glb.db, "comms", columns = "has_calls"): + if_has_calls = " WHERE has_calls = TRUE" query = QSqlQuery(glb.db) - QueryExec(query, "SELECT id, comm FROM comms") + QueryExec(query, "SELECT id, comm FROM comms" + if_has_calls) while query.next(): if not query.value(0): continue -- cgit v1.2.3 From 5bf83c29a0ad2e78683c318b607539dbadbf7a3b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:06 +0300 Subject: perf script: Add scripting operation process_switch() Add scripting operation process_switch() to process switch events. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-18-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 8 +++++++- tools/perf/util/trace-event.h | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 79367087bd18..8f24865596af 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2289,6 +2289,12 @@ static int process_switch_event(struct perf_tool *tool, if (perf_event__process_switch(tool, event, sample, machine) < 0) return -1; + if (scripting_ops && scripting_ops->process_switch) + scripting_ops->process_switch(event, sample, machine); + + if (!script->show_switch_events) + return 0; + thread = machine__findnew_thread(machine, sample->pid, sample->tid); if (thread == NULL) { @@ -2467,7 +2473,7 @@ static int __cmd_script(struct perf_script *script) script->tool.mmap = process_mmap_event; script->tool.mmap2 = process_mmap2_event; } - if (script->show_switch_events) + if (script->show_switch_events || (scripting_ops && scripting_ops->process_switch)) script->tool.context_switch = process_switch_event; if (script->show_namespace_events) script->tool.namespaces = process_namespaces_event; diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index d9b0a942090a..c7002fe11673 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -81,6 +81,9 @@ struct scripting_ops { struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al); + void (*process_switch)(union perf_event *event, + struct perf_sample *sample, + struct machine *machine); void (*process_stat)(struct perf_stat_config *config, struct perf_evsel *evsel, u64 tstamp); void (*process_stat_interval)(u64 tstamp); -- cgit v1.2.3 From b3694e6c0a05383891546c6e3cdef8659d50b653 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:07 +0300 Subject: perf db-export: Factor out db_export__threads() In preparation for exporting switch events, factor out db_export__threads(). Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-19-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 82 ++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 5057fdd7f62d..e6a9c450133e 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -286,50 +286,32 @@ int db_export__branch_type(struct db_export *dbe, u32 branch_type, return 0; } -int db_export__sample(struct db_export *dbe, union perf_event *event, - struct perf_sample *sample, struct perf_evsel *evsel, - struct addr_location *al) +static int db_export__threads(struct db_export *dbe, struct thread *thread, + struct thread *main_thread, + struct machine *machine, struct comm **comm_ptr) { - struct thread *thread = al->thread; - struct export_sample es = { - .event = event, - .sample = sample, - .evsel = evsel, - .al = al, - }; - struct thread *main_thread; struct comm *comm = NULL; struct comm *curr_comm; int err; - err = db_export__evsel(dbe, evsel); - if (err) - return err; - - err = db_export__machine(dbe, al->machine); - if (err) - return err; - - main_thread = thread__main_thread(al->machine, thread); if (main_thread) { /* * A thread has a reference to the main thread, so export the * main thread first. */ - err = db_export__thread(dbe, main_thread, al->machine, - main_thread); + err = db_export__thread(dbe, main_thread, machine, main_thread); if (err) - goto out_put; + return err; /* * Export comm before exporting the non-main thread because * db_export__comm_thread() can be called further below. */ - comm = machine__thread_exec_comm(al->machine, main_thread); + comm = machine__thread_exec_comm(machine, main_thread); if (comm) { err = db_export__exec_comm(dbe, comm, main_thread); if (err) - goto out_put; - es.comm_db_id = comm->db_id; + return err; + *comm_ptr = comm; } } @@ -340,23 +322,55 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, */ bool export_comm_thread = comm && !thread->db_id; - err = db_export__thread(dbe, thread, al->machine, main_thread); + err = db_export__thread(dbe, thread, machine, main_thread); if (err) - goto out_put; + return err; if (export_comm_thread) { err = db_export__comm_thread(dbe, comm, thread); if (err) - goto out_put; + return err; } } curr_comm = thread__comm(thread); - if (curr_comm) { - err = db_export__comm(dbe, curr_comm, thread); - if (err) - goto out_put; - } + if (curr_comm) + return db_export__comm(dbe, curr_comm, thread); + + return 0; +} + +int db_export__sample(struct db_export *dbe, union perf_event *event, + struct perf_sample *sample, struct perf_evsel *evsel, + struct addr_location *al) +{ + struct thread *thread = al->thread; + struct export_sample es = { + .event = event, + .sample = sample, + .evsel = evsel, + .al = al, + }; + struct thread *main_thread; + struct comm *comm = NULL; + int err; + + err = db_export__evsel(dbe, evsel); + if (err) + return err; + + err = db_export__machine(dbe, al->machine); + if (err) + return err; + + main_thread = thread__main_thread(al->machine, thread); + + err = db_export__threads(dbe, thread, main_thread, al->machine, &comm); + if (err) + goto out_put; + + if (comm) + es.comm_db_id = comm->db_id; es.db_id = ++dbe->sample_last_db_id; -- cgit v1.2.3 From abde8722d9b0a317935506d9824e26f1aef6c24a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:08 +0300 Subject: perf db-export: Export switch events Export details of switch events including the threads and their current comms. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-20-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 89 ++++++++++++++++++++++ tools/perf/util/db-export.h | 8 ++ .../util/scripting-engines/trace-event-python.c | 41 ++++++++++ 3 files changed, 138 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index e6a9c450133e..ffbb3e7d3288 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -519,3 +519,92 @@ int db_export__call_return(struct db_export *dbe, struct call_return *cr, return 0; } + +static int db_export__pid_tid(struct db_export *dbe, struct machine *machine, + pid_t pid, pid_t tid, u64 *db_id, + struct comm **comm_ptr, bool *is_idle) +{ + struct thread *thread = machine__find_thread(machine, pid, tid); + struct thread *main_thread; + int err = 0; + + if (!thread || !thread->comm_set) + goto out_put; + + *is_idle = !thread->pid_ && !thread->tid; + + main_thread = thread__main_thread(machine, thread); + + err = db_export__threads(dbe, thread, main_thread, machine, comm_ptr); + + *db_id = thread->db_id; + + thread__put(main_thread); +out_put: + thread__put(thread); + + return err; +} + +int db_export__switch(struct db_export *dbe, union perf_event *event, + struct perf_sample *sample, struct machine *machine) +{ + bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT; + bool out_preempt = out && + (event->header.misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT); + int flags = out | (out_preempt << 1); + bool is_idle_a = false, is_idle_b = false; + u64 th_a_id = 0, th_b_id = 0; + u64 comm_out_id, comm_in_id; + struct comm *comm_a = NULL; + struct comm *comm_b = NULL; + u64 th_out_id, th_in_id; + u64 db_id; + int err; + + err = db_export__machine(dbe, machine); + if (err) + return err; + + err = db_export__pid_tid(dbe, machine, sample->pid, sample->tid, + &th_a_id, &comm_a, &is_idle_a); + if (err) + return err; + + if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) { + pid_t pid = event->context_switch.next_prev_pid; + pid_t tid = event->context_switch.next_prev_tid; + + err = db_export__pid_tid(dbe, machine, pid, tid, &th_b_id, + &comm_b, &is_idle_b); + if (err) + return err; + } + + /* + * Do not export if both threads are unknown (i.e. not being traced), + * or one is unknown and the other is the idle task. + */ + if ((!th_a_id || is_idle_a) && (!th_b_id || is_idle_b)) + return 0; + + db_id = ++dbe->context_switch_last_db_id; + + if (out) { + th_out_id = th_a_id; + th_in_id = th_b_id; + comm_out_id = comm_a ? comm_a->db_id : 0; + comm_in_id = comm_b ? comm_b->db_id : 0; + } else { + th_out_id = th_b_id; + th_in_id = th_a_id; + comm_out_id = comm_b ? comm_b->db_id : 0; + comm_in_id = comm_a ? comm_a->db_id : 0; + } + + if (dbe->export_context_switch) + return dbe->export_context_switch(dbe, db_id, machine, sample, + th_out_id, comm_out_id, + th_in_id, comm_in_id, flags); + return 0; +} diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index f5f0865f07e1..ba1f62a5fe10 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -57,6 +57,11 @@ struct db_export { int (*export_call_path)(struct db_export *dbe, struct call_path *cp); int (*export_call_return)(struct db_export *dbe, struct call_return *cr); + int (*export_context_switch)(struct db_export *dbe, u64 db_id, + struct machine *machine, + struct perf_sample *sample, + u64 th_out_id, u64 comm_out_id, + u64 th_in_id, u64 comm_in_id, int flags); struct call_return_processor *crp; struct call_path_root *cpr; u64 evsel_last_db_id; @@ -69,6 +74,7 @@ struct db_export { u64 sample_last_db_id; u64 call_path_last_db_id; u64 call_return_last_db_id; + u64 context_switch_last_db_id; }; int db_export__init(struct db_export *dbe); @@ -98,5 +104,7 @@ int db_export__branch_types(struct db_export *dbe); int db_export__call_path(struct db_export *dbe, struct call_path *cp); int db_export__call_return(struct db_export *dbe, struct call_return *cr, u64 *parent_db_id); +int db_export__switch(struct db_export *dbe, union perf_event *event, + struct perf_sample *sample, struct machine *machine); #endif diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 28167e938cef..25dc1d765553 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -113,6 +113,7 @@ struct tables { PyObject *call_path_handler; PyObject *call_return_handler; PyObject *synth_handler; + PyObject *context_switch_handler; bool db_export_mode; }; @@ -1237,6 +1238,34 @@ static int python_export_call_return(struct db_export *dbe, return 0; } +static int python_export_context_switch(struct db_export *dbe, u64 db_id, + struct machine *machine, + struct perf_sample *sample, + u64 th_out_id, u64 comm_out_id, + u64 th_in_id, u64 comm_in_id, int flags) +{ + struct tables *tables = container_of(dbe, struct tables, dbe); + PyObject *t; + + t = tuple_new(9); + + tuple_set_u64(t, 0, db_id); + tuple_set_u64(t, 1, machine->db_id); + tuple_set_u64(t, 2, sample->time); + tuple_set_s32(t, 3, sample->cpu); + tuple_set_u64(t, 4, th_out_id); + tuple_set_u64(t, 5, comm_out_id); + tuple_set_u64(t, 6, th_in_id); + tuple_set_u64(t, 7, comm_in_id); + tuple_set_s32(t, 8, flags); + + call_object(tables->context_switch_handler, t, "context_switch"); + + Py_DECREF(t); + + return 0; +} + static int python_process_call_return(struct call_return *cr, u64 *parent_db_id, void *data) { @@ -1300,6 +1329,16 @@ static void python_process_event(union perf_event *event, } } +static void python_process_switch(union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct tables *tables = &tables_global; + + if (tables->db_export_mode) + db_export__switch(&tables->dbe, event, sample, machine); +} + static void get_handler_name(char *str, size_t size, struct perf_evsel *evsel) { @@ -1515,6 +1554,7 @@ static void set_table_handlers(struct tables *tables) SET_TABLE_HANDLER(sample); SET_TABLE_HANDLER(call_path); SET_TABLE_HANDLER(call_return); + SET_TABLE_HANDLER(context_switch); /* * Synthesized events are samples but with architecture-specific data @@ -1833,6 +1873,7 @@ struct scripting_ops python_scripting_ops = { .flush_script = python_flush_script, .stop_script = python_stop_script, .process_event = python_process_event, + .process_switch = python_process_switch, .process_stat = python_process_stat, .process_stat_interval = python_process_stat_interval, .generate_script = python_generate_script, -- cgit v1.2.3 From 37c1f991b1bcdbe268b99b22e265738f4209f4f4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:09 +0300 Subject: perf scripts python: export-to-sqlite.py: Export switch events Export switch events to a new table 'context_switches' and create a view 'context_switches_view'. The table and view will show automatically in the exported-sql-viewer.py script. If the table ends up empty, then it and the view are dropped. Committer testing: Use the exported-sql-viewer.py and look at "Tables" -> "context_switches": id machine_id time cpu thread_out_id comm_out_id thread_in_id comm_in_id flags 1 1 187836111885918 7 1 1 2 2 3 2 1 187836111889369 7 1 1 2 2 0 3 1 187836112464618 7 2 3 1 1 1 4 1 187836112465511 7 2 3 1 1 0 Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-21-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-sqlite.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index 9156f6a1e5f0..8043a7272a56 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -306,6 +306,17 @@ do_query(query, 'CREATE TABLE pwrx (' 'last_cstate integer,' 'wake_reason integer)') +do_query(query, 'CREATE TABLE context_switches (' + 'id integer NOT NULL PRIMARY KEY,' + 'machine_id bigint,' + 'time bigint,' + 'cpu integer,' + 'thread_out_id bigint,' + 'comm_out_id bigint,' + 'thread_in_id bigint,' + 'comm_in_id bigint,' + 'flags integer)') + # printf was added to sqlite in version 3.8.3 sqlite_has_printf = False try: @@ -530,6 +541,29 @@ do_query(query, 'CREATE VIEW power_events_view AS ' ' INNER JOIN selected_events ON selected_events.id = evsel_id' ' WHERE selected_events.name IN (\'cbr\',\'mwait\',\'exstop\',\'pwre\',\'pwrx\')') +do_query(query, 'CREATE VIEW context_switches_view AS ' + 'SELECT ' + 'context_switches.id,' + 'context_switches.machine_id,' + 'context_switches.time,' + 'context_switches.cpu,' + 'th_out.pid AS pid_out,' + 'th_out.tid AS tid_out,' + 'comm_out.comm AS comm_out,' + 'th_in.pid AS pid_in,' + 'th_in.tid AS tid_in,' + 'comm_in.comm AS comm_in,' + 'CASE WHEN context_switches.flags = 0 THEN \'in\'' + ' WHEN context_switches.flags = 1 THEN \'out\'' + ' WHEN context_switches.flags = 3 THEN \'out preempt\'' + ' ELSE context_switches.flags ' + 'END AS flags' + ' FROM context_switches' + ' INNER JOIN threads AS th_out ON th_out.id = context_switches.thread_out_id' + ' INNER JOIN threads AS th_in ON th_in.id = context_switches.thread_in_id' + ' INNER JOIN comms AS comm_out ON comm_out.id = context_switches.comm_out_id' + ' INNER JOIN comms AS comm_in ON comm_in.id = context_switches.comm_in_id') + do_query(query, 'END TRANSACTION') evsel_query = QSqlQuery(db) @@ -571,6 +605,8 @@ exstop_query = QSqlQuery(db) exstop_query.prepare("INSERT INTO exstop VALUES (?, ?)") pwrx_query = QSqlQuery(db) pwrx_query.prepare("INSERT INTO pwrx VALUES (?, ?, ?, ?)") +context_switch_query = QSqlQuery(db) +context_switch_query.prepare("INSERT INTO context_switches VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)") def trace_begin(): printdate("Writing records...") @@ -620,6 +656,8 @@ def trace_end(): drop("pwrx") if is_table_empty("cbr"): drop("cbr") + if is_table_empty("context_switches"): + drop("context_switches") if (unhandled_count): printdate("Warning: ", unhandled_count, " unhandled events") @@ -753,3 +791,6 @@ def synth_data(id, config, raw_buf, *x): pwrx(id, raw_buf) elif config == 5: cbr(id, raw_buf) + +def context_switch_table(*x): + bind_exec(context_switch_query, 9, x) -- cgit v1.2.3 From 56789f3dc127d4f8c07ce2bb48629ba75e8ef16c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Jul 2019 11:58:10 +0300 Subject: perf scripts python: export-to-postgresql.py: Export switch events Export switch events to a new table 'context_switches' and create a view 'context_switches_view'. The table and view will show automatically in the exported-sql-viewer.py script. If the table ends up empty, then it and the view are dropped. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190710085810.1650-22-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-postgresql.py | 51 +++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'tools') diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 13205e4e5b3b..7bd73a904b4e 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -482,6 +482,17 @@ do_query(query, 'CREATE TABLE pwrx (' 'last_cstate integer,' 'wake_reason integer)') +do_query(query, 'CREATE TABLE context_switches (' + 'id bigint NOT NULL,' + 'machine_id bigint,' + 'time bigint,' + 'cpu integer,' + 'thread_out_id bigint,' + 'comm_out_id bigint,' + 'thread_in_id bigint,' + 'comm_in_id bigint,' + 'flags integer)') + do_query(query, 'CREATE VIEW machines_view AS ' 'SELECT ' 'id,' @@ -695,6 +706,29 @@ do_query(query, 'CREATE VIEW power_events_view AS ' ' INNER JOIN selected_events ON selected_events.id = samples.evsel_id' ' ORDER BY samples.id') +do_query(query, 'CREATE VIEW context_switches_view AS ' + 'SELECT ' + 'context_switches.id,' + 'context_switches.machine_id,' + 'context_switches.time,' + 'context_switches.cpu,' + 'th_out.pid AS pid_out,' + 'th_out.tid AS tid_out,' + 'comm_out.comm AS comm_out,' + 'th_in.pid AS pid_in,' + 'th_in.tid AS tid_in,' + 'comm_in.comm AS comm_in,' + 'CASE WHEN context_switches.flags = 0 THEN \'in\'' + ' WHEN context_switches.flags = 1 THEN \'out\'' + ' WHEN context_switches.flags = 3 THEN \'out preempt\'' + ' ELSE CAST ( context_switches.flags AS VARCHAR(11) )' + 'END AS flags' + ' FROM context_switches' + ' INNER JOIN threads AS th_out ON th_out.id = context_switches.thread_out_id' + ' INNER JOIN threads AS th_in ON th_in.id = context_switches.thread_in_id' + ' INNER JOIN comms AS comm_out ON comm_out.id = context_switches.comm_out_id' + ' INNER JOIN comms AS comm_in ON comm_in.id = context_switches.comm_in_id') + file_header = struct.pack("!11sii", b"PGCOPY\n\377\r\n\0", 0, 0) file_trailer = b"\377\377" @@ -759,6 +793,7 @@ mwait_file = open_output_file("mwait_table.bin") pwre_file = open_output_file("pwre_table.bin") exstop_file = open_output_file("exstop_table.bin") pwrx_file = open_output_file("pwrx_table.bin") +context_switches_file = open_output_file("context_switches_table.bin") def trace_begin(): printdate("Writing to intermediate files...") @@ -807,6 +842,7 @@ def trace_end(): copy_output_file(pwre_file, "pwre") copy_output_file(exstop_file, "exstop") copy_output_file(pwrx_file, "pwrx") + copy_output_file(context_switches_file, "context_switches") printdate("Removing intermediate files...") remove_output_file(evsel_file) @@ -828,6 +864,7 @@ def trace_end(): remove_output_file(pwre_file) remove_output_file(exstop_file) remove_output_file(pwrx_file) + remove_output_file(context_switches_file) os.rmdir(output_dir_name) printdate("Adding primary keys") do_query(query, 'ALTER TABLE selected_events ADD PRIMARY KEY (id)') @@ -849,6 +886,7 @@ def trace_end(): do_query(query, 'ALTER TABLE pwre ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE exstop ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE pwrx ADD PRIMARY KEY (id)') + do_query(query, 'ALTER TABLE context_switches ADD PRIMARY KEY (id)') printdate("Adding foreign keys") do_query(query, 'ALTER TABLE threads ' @@ -900,6 +938,12 @@ def trace_end(): 'ADD CONSTRAINT idfk FOREIGN KEY (id) REFERENCES samples (id)') do_query(query, 'ALTER TABLE pwrx ' 'ADD CONSTRAINT idfk FOREIGN KEY (id) REFERENCES samples (id)') + do_query(query, 'ALTER TABLE context_switches ' + 'ADD CONSTRAINT machinefk FOREIGN KEY (machine_id) REFERENCES machines (id),' + 'ADD CONSTRAINT toutfk FOREIGN KEY (thread_out_id) REFERENCES threads (id),' + 'ADD CONSTRAINT tinfk FOREIGN KEY (thread_in_id) REFERENCES threads (id),' + 'ADD CONSTRAINT coutfk FOREIGN KEY (comm_out_id) REFERENCES comms (id),' + 'ADD CONSTRAINT cinfk FOREIGN KEY (comm_in_id) REFERENCES comms (id)') printdate("Dropping unused tables") if is_table_empty("ptwrite"): @@ -912,6 +956,8 @@ def trace_end(): drop("pwrx") if is_table_empty("cbr"): drop("cbr") + if is_table_empty("context_switches"): + drop("context_switches") if (unhandled_count): printdate("Warning: ", unhandled_count, " unhandled events") @@ -1058,3 +1104,8 @@ def synth_data(id, config, raw_buf, *x): pwrx(id, raw_buf) elif config == 5: cbr(id, raw_buf) + +def context_switch_table(id, machine_id, time, cpu, thread_out_id, comm_out_id, thread_in_id, comm_in_id, flags, *x): + fmt = "!hiqiqiqiiiqiqiqiqii" + value = struct.pack(fmt, 9, 8, id, 8, machine_id, 8, time, 4, cpu, 8, thread_out_id, 8, comm_out_id, 8, thread_in_id, 8, comm_in_id, 4, flags) + context_switches_file.write(value) -- cgit v1.2.3 From edc82a99437a93c36b0ae18eb6daac0097fc6bd3 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 21 Mar 2019 10:31:21 +0800 Subject: perf cs-etm: Remove errnoeous ERR_PTR() usage in cs_etm__process_auxtrace_info intlist__findnew() doesn't uses ERR_PTR() as a return mechanism so its callers shouldn't try to extract the error using PTR_ERR( ret) from intlist__findnew(), make cs_etm__process_auxtrace_info return -ENOMEM instead. Signed-off-by: YueHaibing Reviewed-by: Mathieu Poirier Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Fixes: cd8bfd8c973e ("perf tools: Add processing of coresight metadata") Link: http://lkml.kernel.org/r/20190321023122.21332-2-yuehaibing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 67b88b599a53..2e9f5bc45550 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -2460,7 +2460,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event, /* Something went wrong, no need to continue */ if (!inode) { - err = PTR_ERR(inode); + err = -ENOMEM; goto err_free_metadata; } -- cgit v1.2.3 From 6285bd151b95aa28d6de9b8b9249702681f059d2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 21 Mar 2019 10:31:22 +0800 Subject: perf cs-etm: Return errcode in cs_etm__process_auxtrace_info() The 'err' variable is set in the error path, but it's not returned to callers. Don't always return -EINVAL, return err. Signed-off-by: YueHaibing Reviewed-by: Mathieu Poirier Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Fixes: cd8bfd8c973e ("perf tools: Add processing of coresight metadata") Link: http://lkml.kernel.org/r/20190321023122.21332-3-yuehaibing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 2e9f5bc45550..3d1c34fc4d68 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -2517,8 +2517,10 @@ int cs_etm__process_auxtrace_info(union perf_event *event, session->auxtrace = &etm->auxtrace; etm->unknown_thread = thread__new(999999999, 999999999); - if (!etm->unknown_thread) + if (!etm->unknown_thread) { + err = -ENOMEM; goto err_free_queues; + } /* * Initialize list node so that at thread__zput() we can avoid @@ -2530,8 +2532,10 @@ int cs_etm__process_auxtrace_info(union perf_event *event, if (err) goto err_delete_thread; - if (thread__init_map_groups(etm->unknown_thread, etm->machine)) + if (thread__init_map_groups(etm->unknown_thread, etm->machine)) { + err = -ENOMEM; goto err_delete_thread; + } if (dump_trace) { cs_etm__print_auxtrace_info(auxtrace_info->priv, num_cpu); @@ -2575,5 +2579,5 @@ err_free_traceid_list: err_free_hdr: zfree(&hdr); - return -EINVAL; + return err; } -- cgit v1.2.3 From 36db2a94f19ae1fcf1e6d8253df0c32f90f92860 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 8 Jul 2019 21:00:07 -0700 Subject: libbpf: fix ptr to u64 conversion warning on 32-bit platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32-bit platforms compiler complains about conversion: libbpf.c: In function ‘perf_event_open_probe’: libbpf.c:4112:17: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] attr.config1 = (uint64_t)(void *)name; /* kprobe_func or uprobe_path */ ^ Reported-by: Matt Hart Fixes: b26500274767 ("libbpf: add kprobe/uprobe attach API") Tested-by: Matt Hart Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ed07789b3e62..794dd5064ae8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4126,8 +4126,8 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, } attr.size = sizeof(attr); attr.type = type; - attr.config1 = (uint64_t)(void *)name; /* kprobe_func or uprobe_path */ - attr.config2 = offset; /* kprobe_addr or probe_offset */ + attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */ + attr.config2 = offset; /* kprobe_addr or probe_offset */ /* pid filter is meaningful only for uprobes */ pfd = syscall(__NR_perf_event_open, &attr, -- cgit v1.2.3 From 2b4f3dab09b3a2853c49c866a383ffda8ee54a6f Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 10 Jul 2019 10:23:00 -0700 Subject: tools/power/x86/intel-speed-select: Add .gitignore file Add a .gitignore file for build include/ and final binary. Signed-off-by: Prarit Bhargava Acked-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada Cc: Srinivas Pandruvada Cc: Andy Shevchenko Cc: David Arcari Signed-off-by: Andy Shevchenko --- tools/power/x86/intel-speed-select/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/power/x86/intel-speed-select/.gitignore (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/.gitignore b/tools/power/x86/intel-speed-select/.gitignore new file mode 100644 index 000000000000..f61145925ce9 --- /dev/null +++ b/tools/power/x86/intel-speed-select/.gitignore @@ -0,0 +1,2 @@ +include/ +intel-speed-select -- cgit v1.2.3 From 59d82657a08d4a9e6e2d99acc525ef748d36016b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 10 Jul 2019 13:56:54 +0200 Subject: selftests/bpf: fix bpf_target_sparc check bpf_helpers.h fails to compile on sparc: the code should be checking for defined(bpf_target_sparc), but checks simply for bpf_target_sparc. Also change #ifdef bpf_target_powerpc to #if defined() for consistency. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 5a3d92c8bec8..5d63c66bc9d7 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -452,10 +452,10 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #endif -#ifdef bpf_target_powerpc +#if defined(bpf_target_powerpc) #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP -#elif bpf_target_sparc +#elif defined(bpf_target_sparc) #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP #else -- cgit v1.2.3 From 216b65fb706e34128a5317d71b300daac9c428c4 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 12 Jul 2019 10:35:39 +0900 Subject: tools: bpftool: add raw_tracepoint_writable prog type to header From commit 9df1c28bb752 ("bpf: add writable context for raw tracepoints"), a new type of BPF_PROG, RAW_TRACEPOINT_WRITABLE has been added. Since this BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE is not listed at bpftool's header, it causes a segfault when executing 'bpftool feature'. This commit adds BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE entry to prog_type_name enum, and will eventually fixes the segfault issue. Fixes: 9df1c28bb752 ("bpf: add writable context for raw tracepoints") Signed-off-by: Daniel T. Lee Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/main.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 3ef0d9051e10..7031a4bf87a0 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -74,6 +74,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", }; -- cgit v1.2.3 From 9cae4ace80ef39005da106fbb89c952b27d7b89e Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 11:12:49 +0200 Subject: selftests/bpf: do not ignore clang failures When compiling an eBPF prog fails, make still returns 0, because failing clang command's output is piped to llc and therefore its exit status is ignored. When clang fails, pipe the string "clang failed" to llc. This will make llc fail with an informative error message. This solution was chosen over using pipefail, having separate targets or getting rid of llc invocation due to its simplicity. In addition, pull Kbuild.include in order to get .DELETE_ON_ERROR target, which would cause partial .o files to be removed. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2620406a53ec..8312db1ae7c2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +include ../../../../scripts/Kbuild.include LIBDIR := ../../../lib BPFDIR := $(LIBDIR)/bpf @@ -196,8 +197,8 @@ $(ALU32_BUILD_DIR)/test_progs_32: prog_tests/*.c $(ALU32_BUILD_DIR)/%.o: progs/%.c $(ALU32_BUILD_DIR) \ $(ALU32_BUILD_DIR)/test_progs_32 - $(CLANG) $(CLANG_FLAGS) \ - -O2 -target bpf -emit-llvm -c $< -o - | \ + ($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm -c $< -o - || \ + echo "clang failed") | \ $(LLC) -march=bpf -mattr=+alu32 -mcpu=$(CPU) $(LLC_FLAGS) \ -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) @@ -208,16 +209,16 @@ endif # Have one program compiled without "-target bpf" to test whether libbpf loads # it successfully $(OUTPUT)/test_xdp.o: progs/test_xdp.c - $(CLANG) $(CLANG_FLAGS) \ - -O2 -emit-llvm -c $< -o - | \ + ($(CLANG) $(CLANG_FLAGS) -O2 -emit-llvm -c $< -o - || \ + echo "clang failed") | \ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif $(OUTPUT)/%.o: progs/%.c - $(CLANG) $(CLANG_FLAGS) \ - -O2 -target bpf -emit-llvm -c $< -o - | \ + ($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm -c $< -o - || \ + echo "clang failed") | \ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ -- cgit v1.2.3 From 748e50c1c13d33dffcdca66f0b1cb3d96d17fe31 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:27 +0200 Subject: selftests/bpf: compile progs with -D__TARGET_ARCH_$(SRCARCH) This opens up the possibility of accessing registers in an arch-independent way. Signed-off-by: Ilya Leoshkevich Reviewed-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8312db1ae7c2..277d8605e340 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 include ../../../../scripts/Kbuild.include +include ../../../scripts/Makefile.arch LIBDIR := ../../../lib BPFDIR := $(LIBDIR)/bpf @@ -139,7 +140,8 @@ CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - &1 \ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ $(CLANG_SYS_INCLUDES) \ - -Wno-compare-distinct-pointer-types + -Wno-compare-distinct-pointer-types \ + -D__TARGET_ARCH_$(SRCARCH) $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline -- cgit v1.2.3 From 05c2dc17dae310f92a0c5979e227133b0e85f2fa Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:28 +0200 Subject: selftests/bpf: fix s930 -> s390 typo Also check for __s390__ instead of __s390x__, just in case bpf_helpers.h is ever used by 32-bit userspace. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Reviewed-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 5d63c66bc9d7..f6e3768660be 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -315,8 +315,8 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #if defined(__TARGET_ARCH_x86) #define bpf_target_x86 #define bpf_target_defined -#elif defined(__TARGET_ARCH_s930x) - #define bpf_target_s930x +#elif defined(__TARGET_ARCH_s390) + #define bpf_target_s390 #define bpf_target_defined #elif defined(__TARGET_ARCH_arm) #define bpf_target_arm @@ -341,8 +341,8 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #ifndef bpf_target_defined #if defined(__x86_64__) #define bpf_target_x86 -#elif defined(__s390x__) - #define bpf_target_s930x +#elif defined(__s390__) + #define bpf_target_s390 #elif defined(__arm__) #define bpf_target_arm #elif defined(__aarch64__) @@ -369,7 +369,7 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->ip) -#elif defined(bpf_target_s390x) +#elif defined(bpf_target_s390) #define PT_REGS_PARM1(x) ((x)->gprs[2]) #define PT_REGS_PARM2(x) ((x)->gprs[3]) -- cgit v1.2.3 From 7cd04535abc95c50ba9bfa48efc76274b63c70f5 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:29 +0200 Subject: selftests/bpf: make PT_REGS_* work in userspace Right now, on certain architectures, these macros are usable only with kernel headers. This patch makes it possible to use them with userspace headers and, as a consequence, not only in BPF samples, but also in BPF selftests. On s390, provide the forward declaration of struct pt_regs and cast it to user_pt_regs in PT_REGS_* macros. This is necessary, because instead of the full struct pt_regs, s390 exposes only its first member user_pt_regs to userspace, and bpf_helpers.h is used with both userspace (in selftests) and kernel (in samples) headers. It was added in commit 466698e654e8 ("s390/bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type"). Ditto on arm64. On x86, provide userspace versions of PT_REGS_* macros. Unlike s390 and arm64, x86 provides struct pt_regs to both userspace and kernel, however, with different member names. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Reviewed-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 75 ++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index f6e3768660be..f804f210244e 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -358,6 +358,7 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #if defined(bpf_target_x86) +#ifdef __KERNEL__ #define PT_REGS_PARM1(x) ((x)->di) #define PT_REGS_PARM2(x) ((x)->si) #define PT_REGS_PARM3(x) ((x)->dx) @@ -368,19 +369,49 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #define PT_REGS_RC(x) ((x)->ax) #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->ip) +#else +#ifdef __i386__ +/* i386 kernel is built with -mregparm=3 */ +#define PT_REGS_PARM1(x) ((x)->eax) +#define PT_REGS_PARM2(x) ((x)->edx) +#define PT_REGS_PARM3(x) ((x)->ecx) +#define PT_REGS_PARM4(x) 0 +#define PT_REGS_PARM5(x) 0 +#define PT_REGS_RET(x) ((x)->esp) +#define PT_REGS_FP(x) ((x)->ebp) +#define PT_REGS_RC(x) ((x)->eax) +#define PT_REGS_SP(x) ((x)->esp) +#define PT_REGS_IP(x) ((x)->eip) +#else +#define PT_REGS_PARM1(x) ((x)->rdi) +#define PT_REGS_PARM2(x) ((x)->rsi) +#define PT_REGS_PARM3(x) ((x)->rdx) +#define PT_REGS_PARM4(x) ((x)->rcx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->rsp) +#define PT_REGS_FP(x) ((x)->rbp) +#define PT_REGS_RC(x) ((x)->rax) +#define PT_REGS_SP(x) ((x)->rsp) +#define PT_REGS_IP(x) ((x)->rip) +#endif +#endif #elif defined(bpf_target_s390) -#define PT_REGS_PARM1(x) ((x)->gprs[2]) -#define PT_REGS_PARM2(x) ((x)->gprs[3]) -#define PT_REGS_PARM3(x) ((x)->gprs[4]) -#define PT_REGS_PARM4(x) ((x)->gprs[5]) -#define PT_REGS_PARM5(x) ((x)->gprs[6]) -#define PT_REGS_RET(x) ((x)->gprs[14]) -#define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ -#define PT_REGS_RC(x) ((x)->gprs[2]) -#define PT_REGS_SP(x) ((x)->gprs[15]) -#define PT_REGS_IP(x) ((x)->psw.addr) +/* s390 provides user_pt_regs instead of struct pt_regs to userspace */ +struct pt_regs; +#define PT_REGS_S390 const volatile user_pt_regs +#define PT_REGS_PARM1(x) (((PT_REGS_S390 *)(x))->gprs[2]) +#define PT_REGS_PARM2(x) (((PT_REGS_S390 *)(x))->gprs[3]) +#define PT_REGS_PARM3(x) (((PT_REGS_S390 *)(x))->gprs[4]) +#define PT_REGS_PARM4(x) (((PT_REGS_S390 *)(x))->gprs[5]) +#define PT_REGS_PARM5(x) (((PT_REGS_S390 *)(x))->gprs[6]) +#define PT_REGS_RET(x) (((PT_REGS_S390 *)(x))->gprs[14]) +/* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_FP(x) (((PT_REGS_S390 *)(x))->gprs[11]) +#define PT_REGS_RC(x) (((PT_REGS_S390 *)(x))->gprs[2]) +#define PT_REGS_SP(x) (((PT_REGS_S390 *)(x))->gprs[15]) +#define PT_REGS_IP(x) (((PT_REGS_S390 *)(x))->psw.addr) #elif defined(bpf_target_arm) @@ -397,16 +428,20 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #elif defined(bpf_target_arm64) -#define PT_REGS_PARM1(x) ((x)->regs[0]) -#define PT_REGS_PARM2(x) ((x)->regs[1]) -#define PT_REGS_PARM3(x) ((x)->regs[2]) -#define PT_REGS_PARM4(x) ((x)->regs[3]) -#define PT_REGS_PARM5(x) ((x)->regs[4]) -#define PT_REGS_RET(x) ((x)->regs[30]) -#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ -#define PT_REGS_RC(x) ((x)->regs[0]) -#define PT_REGS_SP(x) ((x)->sp) -#define PT_REGS_IP(x) ((x)->pc) +/* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ +struct pt_regs; +#define PT_REGS_ARM64 const volatile struct user_pt_regs +#define PT_REGS_PARM1(x) (((PT_REGS_ARM64 *)(x))->regs[0]) +#define PT_REGS_PARM2(x) (((PT_REGS_ARM64 *)(x))->regs[1]) +#define PT_REGS_PARM3(x) (((PT_REGS_ARM64 *)(x))->regs[2]) +#define PT_REGS_PARM4(x) (((PT_REGS_ARM64 *)(x))->regs[3]) +#define PT_REGS_PARM5(x) (((PT_REGS_ARM64 *)(x))->regs[4]) +#define PT_REGS_RET(x) (((PT_REGS_ARM64 *)(x))->regs[30]) +/* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_FP(x) (((PT_REGS_ARM64 *)(x))->regs[29]) +#define PT_REGS_RC(x) (((PT_REGS_ARM64 *)(x))->regs[0]) +#define PT_REGS_SP(x) (((PT_REGS_ARM64 *)(x))->sp) +#define PT_REGS_IP(x) (((PT_REGS_ARM64 *)(x))->pc) #elif defined(bpf_target_mips) -- cgit v1.2.3 From af3c24e0e2ed25177b03b60ada9117bb596e8d95 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:30 +0200 Subject: selftests/bpf: fix compiling loop{1, 2, 3}.c on s390 Use PT_REGS_RC(ctx) instead of ctx->rax, which is not present on s390. Signed-off-by: Ilya Leoshkevich Reviewed-by: Stanislav Fomichev Tested-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/progs/loop1.c | 2 +- tools/testing/selftests/bpf/progs/loop2.c | 2 +- tools/testing/selftests/bpf/progs/loop3.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c index dea395af9ea9..7cdb7f878310 100644 --- a/tools/testing/selftests/bpf/progs/loop1.c +++ b/tools/testing/selftests/bpf/progs/loop1.c @@ -18,7 +18,7 @@ int nested_loops(volatile struct pt_regs* ctx) for (j = 0; j < 300; j++) for (i = 0; i < j; i++) { if (j & 1) - m = ctx->rax; + m = PT_REGS_RC(ctx); else m = j; sum += i * m; diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c index 0637bd8e8bcf..9b2f808a2863 100644 --- a/tools/testing/selftests/bpf/progs/loop2.c +++ b/tools/testing/selftests/bpf/progs/loop2.c @@ -16,7 +16,7 @@ int while_true(volatile struct pt_regs* ctx) int i = 0; while (true) { - if (ctx->rax & 1) + if (PT_REGS_RC(ctx) & 1) i += 3; else i += 7; diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 30a0f6cba080..d727657d51e2 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -16,7 +16,7 @@ int while_true(volatile struct pt_regs* ctx) __u64 i = 0, sum = 0; do { i++; - sum += ctx->rax; + sum += PT_REGS_RC(ctx); } while (i < 0x100000000ULL); return sum; } -- cgit v1.2.3 From 6e67d77d673d785631b0c52314b60d3c68ebe809 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 12 Jul 2019 14:31:13 +0200 Subject: perf vendor events s390: Add JSON files for machine type 8561 Add CPU measurement counter facility event description files (JSON) for IBM machine types 8561 and 8562. Signed-off-by: Thomas Richter Reviewed-by: Vasily Gorbik Cc: Heiko Carstens Cc: Hendrik Brueckner Link: http://lkml.kernel.org/r/20190712123113.100882-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/s390/cf_m8561/basic.json | 58 ++++ .../perf/pmu-events/arch/s390/cf_m8561/crypto.json | 114 +++++++ .../pmu-events/arch/s390/cf_m8561/crypto6.json | 30 ++ .../pmu-events/arch/s390/cf_m8561/extended.json | 373 +++++++++++++++++++++ tools/perf/pmu-events/arch/s390/mapfile.csv | 1 + 5 files changed, 576 insertions(+) create mode 100644 tools/perf/pmu-events/arch/s390/cf_m8561/basic.json create mode 100644 tools/perf/pmu-events/arch/s390/cf_m8561/crypto.json create mode 100644 tools/perf/pmu-events/arch/s390/cf_m8561/crypto6.json create mode 100644 tools/perf/pmu-events/arch/s390/cf_m8561/extended.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/basic.json b/tools/perf/pmu-events/arch/s390/cf_m8561/basic.json new file mode 100644 index 000000000000..17fb5241928b --- /dev/null +++ b/tools/perf/pmu-events/arch/s390/cf_m8561/basic.json @@ -0,0 +1,58 @@ +[ + { + "Unit": "CPU-M-CF", + "EventCode": "0", + "EventName": "CPU_CYCLES", + "BriefDescription": "CPU Cycles", + "PublicDescription": "Cycle Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "1", + "EventName": "INSTRUCTIONS", + "BriefDescription": "Instructions", + "PublicDescription": "Instruction Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "2", + "EventName": "L1I_DIR_WRITES", + "BriefDescription": "L1I Directory Writes", + "PublicDescription": "Level-1 I-Cache Directory Write Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "3", + "EventName": "L1I_PENALTY_CYCLES", + "BriefDescription": "L1I Penalty Cycles", + "PublicDescription": "Level-1 I-Cache Penalty Cycle Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "4", + "EventName": "L1D_DIR_WRITES", + "BriefDescription": "L1D Directory Writes", + "PublicDescription": "Level-1 D-Cache Directory Write Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "5", + "EventName": "L1D_PENALTY_CYCLES", + "BriefDescription": "L1D Penalty Cycles", + "PublicDescription": "Level-1 D-Cache Penalty Cycle Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "32", + "EventName": "PROBLEM_STATE_CPU_CYCLES", + "BriefDescription": "Problem-State CPU Cycles", + "PublicDescription": "Problem-State Cycle Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "33", + "EventName": "PROBLEM_STATE_INSTRUCTIONS", + "BriefDescription": "Problem-State Instructions", + "PublicDescription": "Problem-State Instruction Count" + }, +] diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/crypto.json b/tools/perf/pmu-events/arch/s390/cf_m8561/crypto.json new file mode 100644 index 000000000000..db286f19e7b6 --- /dev/null +++ b/tools/perf/pmu-events/arch/s390/cf_m8561/crypto.json @@ -0,0 +1,114 @@ +[ + { + "Unit": "CPU-M-CF", + "EventCode": "64", + "EventName": "PRNG_FUNCTIONS", + "BriefDescription": "PRNG Functions", + "PublicDescription": "Total number of the PRNG functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "65", + "EventName": "PRNG_CYCLES", + "BriefDescription": "PRNG Cycles", + "PublicDescription": "Total number of CPU cycles when the DEA/AES coprocessor is busy performing PRNG functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "66", + "EventName": "PRNG_BLOCKED_FUNCTIONS", + "BriefDescription": "PRNG Blocked Functions", + "PublicDescription": "Total number of the PRNG functions that are issued by the CPU and are blocked because the DEA/AES coprocessor is busy performing a function issued by another CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "67", + "EventName": "PRNG_BLOCKED_CYCLES", + "BriefDescription": "PRNG Blocked Cycles", + "PublicDescription": "Total number of CPU cycles blocked for the PRNG functions issued by the CPU because the DEA/AES coprocessor is busy performing a function issued by another CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "68", + "EventName": "SHA_FUNCTIONS", + "BriefDescription": "SHA Functions", + "PublicDescription": "Total number of SHA functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "69", + "EventName": "SHA_CYCLES", + "BriefDescription": "SHA Cycles", + "PublicDescription": "Total number of CPU cycles when the SHA coprocessor is busy performing the SHA functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "70", + "EventName": "SHA_BLOCKED_FUNCTIONS", + "BriefDescription": "SHA Blocked Functions", + "PublicDescription": "Total number of the SHA functions that are issued by the CPU and are blocked because the SHA coprocessor is busy performing a function issued by another CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "71", + "EventName": "SHA_BLOCKED_CYCLES", + "BriefDescription": "SHA Bloced Cycles", + "PublicDescription": "Total number of CPU cycles blocked for the SHA functions issued by the CPU because the SHA coprocessor is busy performing a function issued by another CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "72", + "EventName": "DEA_FUNCTIONS", + "BriefDescription": "DEA Functions", + "PublicDescription": "Total number of the DEA functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "73", + "EventName": "DEA_CYCLES", + "BriefDescription": "DEA Cycles", + "PublicDescription": "Total number of CPU cycles when the DEA/AES coprocessor is busy performing the DEA functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "74", + "EventName": "DEA_BLOCKED_FUNCTIONS", + "BriefDescription": "DEA Blocked Functions", + "PublicDescription": "Total number of the DEA functions that are issued by the CPU and are blocked because the DEA/AES coprocessor is busy performing a function issued by another CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "75", + "EventName": "DEA_BLOCKED_CYCLES", + "BriefDescription": "DEA Blocked Cycles", + "PublicDescription": "Total number of CPU cycles blocked for the DEA functions issued by the CPU because the DEA/AES coprocessor is busy performing a function issued by another CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "76", + "EventName": "AES_FUNCTIONS", + "BriefDescription": "AES Functions", + "PublicDescription": "Total number of AES functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "77", + "EventName": "AES_CYCLES", + "BriefDescription": "AES Cycles", + "PublicDescription": "Total number of CPU cycles when the DEA/AES coprocessor is busy performing the AES functions issued by the CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "78", + "EventName": "AES_BLOCKED_FUNCTIONS", + "BriefDescription": "AES Blocked Functions", + "PublicDescription": "Total number of AES functions that are issued by the CPU and are blocked because the DEA/AES coprocessor is busy performing a function issued by another CPU" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "79", + "EventName": "AES_BLOCKED_CYCLES", + "BriefDescription": "AES Blocked Cycles", + "PublicDescription": "Total number of CPU cycles blocked for the AES functions issued by the CPU because the DEA/AES coprocessor is busy performing a function issued by another CPU" + }, +] diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/crypto6.json b/tools/perf/pmu-events/arch/s390/cf_m8561/crypto6.json new file mode 100644 index 000000000000..5e36bc2468d0 --- /dev/null +++ b/tools/perf/pmu-events/arch/s390/cf_m8561/crypto6.json @@ -0,0 +1,30 @@ +[ + { + "Unit": "CPU-M-CF", + "EventCode": "80", + "EventName": "ECC_FUNCTION_COUNT", + "BriefDescription": "ECC Function Count", + "PublicDescription": "Long ECC function Count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "81", + "EventName": "ECC_CYCLES_COUNT", + "BriefDescription": "ECC Cycles Count", + "PublicDescription": "Long ECC Function cycles count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "82", + "EventName": "ECC_BLOCKED_FUNCTION_COUNT", + "BriefDescription": "Ecc Blocked Function Count", + "PublicDescription": "Long ECC blocked function count" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "83", + "EventName": "ECC_BLOCKED_CYCLES_COUNT", + "BriefDescription": "ECC Blocked Cycles Count", + "PublicDescription": "Long ECC blocked cycles count" + }, +] diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/extended.json b/tools/perf/pmu-events/arch/s390/cf_m8561/extended.json new file mode 100644 index 000000000000..89e070727e1b --- /dev/null +++ b/tools/perf/pmu-events/arch/s390/cf_m8561/extended.json @@ -0,0 +1,373 @@ +[ + { + "Unit": "CPU-M-CF", + "EventCode": "128", + "EventName": "L1D_RO_EXCL_WRITES", + "BriefDescription": "L1D Read-only Exclusive Writes", + "PublicDescription": "A directory write to the Level-1 Data cache where the line was originally in a Read-Only state in the cache but has been updated to be in the Exclusive state that allows stores to the cache line" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "129", + "EventName": "DTLB2_WRITES", + "BriefDescription": "DTLB2 Writes", + "PublicDescription": "A translation has been written into The Translation Lookaside Buffer 2 (TLB2) and the request was made by the data cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "130", + "EventName": "DTLB2_MISSES", + "BriefDescription": "DTLB2 Misses", + "PublicDescription": "A TLB2 miss is in progress for a request made by the data cache. Incremented by one for every TLB2 miss in progress for the Level-1 Data cache on this cycle" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "131", + "EventName": "DTLB2_HPAGE_WRITES", + "BriefDescription": "DTLB2 One-Megabyte Page Writes", + "PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page or a Last Host Translation was done" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "132", + "EventName": "DTLB2_GPAGE_WRITES", + "BriefDescription": "DTLB2 Two-Gigabyte Page Writes", + "PublicDescription": "A translation entry for a two-gigabyte page was written into the Level-2 TLB" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "133", + "EventName": "L1D_L2D_SOURCED_WRITES", + "BriefDescription": "L1D L2D Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from the Level-2 Data cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "134", + "EventName": "ITLB2_WRITES", + "BriefDescription": "ITLB2 Writes", + "PublicDescription": "A translation entry has been written into the Translation Lookaside Buffer 2 (TLB2) and the request was made by the instruction cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "135", + "EventName": "ITLB2_MISSES", + "BriefDescription": "ITLB2 Misses", + "PublicDescription": "A TLB2 miss is in progress for a request made by the instruction cache. Incremented by one for every TLB2 miss in progress for the Level-1 Instruction cache in a cycle" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "136", + "EventName": "L1I_L2I_SOURCED_WRITES", + "BriefDescription": "L1I L2I Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from the Level-2 Instruction cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "137", + "EventName": "TLB2_PTE_WRITES", + "BriefDescription": "TLB2 PTE Writes", + "PublicDescription": "A translation entry was written into the Page Table Entry array in the Level-2 TLB" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "138", + "EventName": "TLB2_CRSTE_WRITES", + "BriefDescription": "TLB2 CRSTE Writes", + "PublicDescription": "Translation entries were written into the Combined Region and Segment Table Entry array and the Page Table Entry array in the Level-2 TLB" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "139", + "EventName": "TLB2_ENGINES_BUSY", + "BriefDescription": "TLB2 Engines Busy", + "PublicDescription": "The number of Level-2 TLB translation engines busy in a cycle" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "140", + "EventName": "TX_C_TEND", + "BriefDescription": "Completed TEND instructions in constrained TX mode", + "PublicDescription": "A TEND instruction has completed in a constrained transactional-execution mode" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "141", + "EventName": "TX_NC_TEND", + "BriefDescription": "Completed TEND instructions in non-constrained TX mode", + "PublicDescription": "A TEND instruction has completed in a non-constrained transactional-execution mode" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "143", + "EventName": "L1C_TLB2_MISSES", + "BriefDescription": "L1C TLB2 Misses", + "PublicDescription": "Increments by one for any cycle where a level-1 cache or level-2 TLB miss is in progress" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "144", + "EventName": "L1D_ONCHIP_L3_SOURCED_WRITES", + "BriefDescription": "L1D On-Chip L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Chip Level-3 cache without intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "145", + "EventName": "L1D_ONCHIP_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1D On-Chip Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from On-Chip memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "146", + "EventName": "L1D_ONCHIP_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1D On-Chip L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Chip Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "147", + "EventName": "L1D_ONCLUSTER_L3_SOURCED_WRITES", + "BriefDescription": "L1D On-Cluster L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from On-Cluster Level-3 cache withountervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "148", + "EventName": "L1D_ONCLUSTER_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1D On-Cluster Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Cluster memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "149", + "EventName": "L1D_ONCLUSTER_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1D On-Cluster L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Cluster Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "150", + "EventName": "L1D_OFFCLUSTER_L3_SOURCED_WRITES", + "BriefDescription": "L1D Off-Cluster L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an Off-Cluster Level-3 cache without intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "151", + "EventName": "L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1D Off-Cluster Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from Off-Cluster memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "152", + "EventName": "L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1D Off-Cluster L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an Off-Cluster Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "153", + "EventName": "L1D_OFFDRAWER_L3_SOURCED_WRITES", + "BriefDescription": "L1D Off-Drawer L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an Off-Drawer Level-3 cache without intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "154", + "EventName": "L1D_OFFDRAWER_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1D Off-Drawer Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from Off-Drawer memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "155", + "EventName": "L1D_OFFDRAWER_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1D Off-Drawer L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an Off-Drawer Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "156", + "EventName": "L1D_ONDRAWER_L4_SOURCED_WRITES", + "BriefDescription": "L1D On-Drawer L4 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from On-Drawer Level-4 cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "157", + "EventName": "L1D_OFFDRAWER_L4_SOURCED_WRITES", + "BriefDescription": "L1D Off-Drawer L4 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from Off-Drawer Level-4 cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "158", + "EventName": "L1D_ONCHIP_L3_SOURCED_WRITES_RO", + "BriefDescription": "L1D On-Chip L3 Sourced Writes read-only", + "PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from On-Chip L3 but a read-only invalidate was done to remove other copies of the cache line" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "162", + "EventName": "L1I_ONCHIP_L3_SOURCED_WRITES", + "BriefDescription": "L1I On-Chip L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache ine was sourced from an On-Chip Level-3 cache without intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "163", + "EventName": "L1I_ONCHIP_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1I On-Chip Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache ine was sourced from On-Chip memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "164", + "EventName": "L1I_ONCHIP_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1I On-Chip L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache ine was sourced from an On-Chip Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "165", + "EventName": "L1I_ONCLUSTER_L3_SOURCED_WRITES", + "BriefDescription": "L1I On-Cluster L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an On-Cluster Level-3 cache without intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "166", + "EventName": "L1I_ONCLUSTER_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1I On-Cluster Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an On-Cluster memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "167", + "EventName": "L1I_ONCLUSTER_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1I On-Cluster L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from On-Cluster Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "168", + "EventName": "L1I_OFFCLUSTER_L3_SOURCED_WRITES", + "BriefDescription": "L1I Off-Cluster L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an Off-Cluster Level-3 cache without intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "169", + "EventName": "L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1I Off-Cluster Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from Off-Cluster memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "170", + "EventName": "L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1I Off-Cluster L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an Off-Cluster Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "171", + "EventName": "L1I_OFFDRAWER_L3_SOURCED_WRITES", + "BriefDescription": "L1I Off-Drawer L3 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an Off-Drawer Level-3 cache without intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "172", + "EventName": "L1I_OFFDRAWER_MEMORY_SOURCED_WRITES", + "BriefDescription": "L1I Off-Drawer Memory Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from Off-Drawer memory" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "173", + "EventName": "L1I_OFFDRAWER_L3_SOURCED_WRITES_IV", + "BriefDescription": "L1I Off-Drawer L3 Sourced Writes with Intervention", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an Off-Drawer Level-3 cache with intervention" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "174", + "EventName": "L1I_ONDRAWER_L4_SOURCED_WRITES", + "BriefDescription": "L1I On-Drawer L4 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from On-Drawer Level-4 cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "175", + "EventName": "L1I_OFFDRAWER_L4_SOURCED_WRITES", + "BriefDescription": "L1I Off-Drawer L4 Sourced Writes", + "PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from Off-Drawer Level-4 cache" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "224", + "EventName": "BCD_DFP_EXECUTION_SLOTS", + "BriefDescription": "BCD DFP Execution Slots", + "PublicDescription": "Count of floating point execution slots used for finished Binary Coded Decimal to Decimal Floating Point conversions. Instructions: CDZT, CXZT, CZDT, CZXT" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "225", + "EventName": "VX_BCD_EXECUTION_SLOTS", + "BriefDescription": "VX BCD Execution Slots", + "PublicDescription": "Count of floating point execution slots used for finished vector arithmetic Binary Coded Decimal instructions. Instructions: VAP, VSP, VMPVMSP, VDP, VSDP, VRP, VLIP, VSRP, VPSOPVCP, VTP, VPKZ, VUPKZ, VCVB, VCVBG, VCVDVCVDG" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "226", + "EventName": "DECIMAL_INSTRUCTIONS", + "BriefDescription": "Decimal Instructions", + "PublicDescription": "Decimal instructions dispatched. Instructions: CVB, CVD, AP, CP, DP, ED, EDMK, MP, SRP, SP, ZAP" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "232", + "EventName": "LAST_HOST_TRANSLATIONS", + "BriefDescription": "Last host translation done", + "PublicDescription": "Last Host Translation done" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "243", + "EventName": "TX_NC_TABORT", + "BriefDescription": "Aborted transactions in non-constrained TX mode", + "PublicDescription": "A transaction abort has occurred in a non-constrained transactional-execution mode" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "244", + "EventName": "TX_C_TABORT_NO_SPECIAL", + "BriefDescription": "Aborted transactions in constrained TX mode not using special completion logic", + "PublicDescription": "A transaction abort has occurred in a constrained transactional-execution mode and the CPU is not using any special logic to allow the transaction to complete" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "245", + "EventName": "TX_C_TABORT_SPECIAL", + "BriefDescription": "Aborted transactions in constrained TX mode using special completion logic", + "PublicDescription": "A transaction abort has occurred in a constrained transactional-execution mode and the CPU is using special logic to allow the transaction to complete" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "448", + "EventName": "MT_DIAG_CYCLES_ONE_THR_ACTIVE", + "BriefDescription": "Cycle count with one thread active", + "PublicDescription": "Cycle count with one thread active" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "449", + "EventName": "MT_DIAG_CYCLES_TWO_THR_ACTIVE", + "BriefDescription": "Cycle count with two threads active", + "PublicDescription": "Cycle count with two threads active" + }, +] diff --git a/tools/perf/pmu-events/arch/s390/mapfile.csv b/tools/perf/pmu-events/arch/s390/mapfile.csv index 78bcf7f8e206..bd3fc577139c 100644 --- a/tools/perf/pmu-events/arch/s390/mapfile.csv +++ b/tools/perf/pmu-events/arch/s390/mapfile.csv @@ -4,3 +4,4 @@ Family-model,Version,Filename,EventType ^IBM.282[78].*[13]\.[1-5].[[:xdigit:]]+$,1,cf_zec12,core ^IBM.296[45].*[13]\.[1-5].[[:xdigit:]]+$,1,cf_z13,core ^IBM.390[67].*[13]\.[1-5].[[:xdigit:]]+$,3,cf_z14,core +^IBM.856[12].*3\.6.[[:xdigit:]]+$,3,cf_m8561,core -- cgit v1.2.3 From 100c4043b808739d808bb5c9f6868d3808f062ea Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Thu, 11 Jul 2019 12:29:00 -0400 Subject: tc-tests: updated skbedit tests - Added mask upper bound test case - Added mask validation test case - Added mask replacement case Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/skbedit.json | 117 +++++++++++++++++++++ 1 file changed, 117 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json index 45e7e89928a5..bf5ebf59c2d4 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json @@ -69,6 +69,123 @@ "matchCount": "0", "teardown": [] }, + { + "id": "d4cd", + "name": "Add skbedit action with valid mark and mask", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/0xaabb", + "expExitCode": "0", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabb", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbedit" + ] + }, + { + "id": "baa7", + "name": "Add skbedit action with valid mark and 32-bit maximum mask", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/0xffffffff", + "expExitCode": "0", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xffffffff", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbedit" + ] + }, + { + "id": "62a5", + "name": "Add skbedit action with valid mark and mask exceeding 32-bit maximum", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/0xaabbccddeeff112233", + "expExitCode": "255", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabbccddeeff112233", + "matchCount": "0", + "teardown": [] + }, + { + "id": "bc15", + "name": "Add skbedit action with valid mark and mask with invalid format", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/-1234", + "expExitCode": "255", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/-1234", + "matchCount": "0", + "teardown": [] + }, + { + "id": "57c2", + "name": "Replace skbedit action with new mask", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ], + "$TC actions add action skbedit mark 1/0x11223344 index 1" + ], + "cmdUnderTest": "$TC actions replace action skbedit mark 1/0xaabb index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabb", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbedit" + ] + }, { "id": "081d", "name": "Add skbedit action with priority", -- cgit v1.2.3 From 916c31fff946fae0e05862f9b2435fdb29fd5090 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 11 Jun 2019 08:31:09 +0530 Subject: perf version: Fix segfault due to missing OPT_END() 'perf version' on powerpc segfaults when used with non-supported option: # perf version -a Segmentation fault (core dumped) Fix this. Signed-off-by: Ravi Bangoria Reviewed-by: Kamalesh Babulal Tested-by: Mamatha Inamdar Cc: Jiri Olsa Cc: Kamalesh Babulal Link: http://lkml.kernel.org/r/20190611030109.20228-1-ravi.bangoria@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-version.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c index f470144d1a70..bf114ca9ca87 100644 --- a/tools/perf/builtin-version.c +++ b/tools/perf/builtin-version.c @@ -19,6 +19,7 @@ static struct version version; static struct option version_options[] = { OPT_BOOLEAN(0, "build-options", &version.build_options, "display the build options"), + OPT_END(), }; static const char * const version_usage[] = { -- cgit v1.2.3 From 39443104c7d3f2b05a4a330fbcef6da68f80d60b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Apr 2019 17:29:24 -0300 Subject: docs: blockdev: convert to ReST Rename the blockdev documentation files to ReST, add an index for them and adjust in order to produce a nice html output via the Sphinx build system. The drbd sub-directory contains some graphs and data flows. Add those too to the documentation. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab --- Documentation/admin-guide/kernel-parameters.txt | 18 +- Documentation/blockdev/drbd/README.txt | 16 - Documentation/blockdev/drbd/data-structure-v9.rst | 42 +++ Documentation/blockdev/drbd/data-structure-v9.txt | 38 -- Documentation/blockdev/drbd/figures.rst | 28 ++ Documentation/blockdev/drbd/index.rst | 19 + Documentation/blockdev/floppy.rst | 255 +++++++++++++ Documentation/blockdev/floppy.txt | 245 ------------ Documentation/blockdev/index.rst | 16 + Documentation/blockdev/nbd.rst | 31 ++ Documentation/blockdev/nbd.txt | 31 -- Documentation/blockdev/paride.rst | 439 ++++++++++++++++++++++ Documentation/blockdev/paride.txt | 417 -------------------- Documentation/blockdev/ramdisk.rst | 177 +++++++++ Documentation/blockdev/ramdisk.txt | 174 --------- Documentation/blockdev/zram.rst | 422 +++++++++++++++++++++ Documentation/blockdev/zram.txt | 355 ----------------- MAINTAINERS | 8 +- drivers/block/Kconfig | 8 +- drivers/block/floppy.c | 2 +- drivers/block/zram/Kconfig | 6 +- tools/testing/selftests/zram/README | 2 +- 22 files changed, 1451 insertions(+), 1298 deletions(-) delete mode 100644 Documentation/blockdev/drbd/README.txt create mode 100644 Documentation/blockdev/drbd/data-structure-v9.rst delete mode 100644 Documentation/blockdev/drbd/data-structure-v9.txt create mode 100644 Documentation/blockdev/drbd/figures.rst create mode 100644 Documentation/blockdev/drbd/index.rst create mode 100644 Documentation/blockdev/floppy.rst delete mode 100644 Documentation/blockdev/floppy.txt create mode 100644 Documentation/blockdev/index.rst create mode 100644 Documentation/blockdev/nbd.rst delete mode 100644 Documentation/blockdev/nbd.txt create mode 100644 Documentation/blockdev/paride.rst delete mode 100644 Documentation/blockdev/paride.txt create mode 100644 Documentation/blockdev/ramdisk.rst delete mode 100644 Documentation/blockdev/ramdisk.txt create mode 100644 Documentation/blockdev/zram.rst delete mode 100644 Documentation/blockdev/zram.txt (limited to 'tools') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a342dd5c95a9..6b2adda1cc03 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1249,7 +1249,7 @@ See also Documentation/fault-injection/. floppy= [HW] - See Documentation/blockdev/floppy.txt. + See Documentation/blockdev/floppy.rst. force_pal_cache_flush [IA-64] Avoid check_sal_cache_flush which may hang on @@ -2234,7 +2234,7 @@ memblock=debug [KNL] Enable memblock debug messages. load_ramdisk= [RAM] List of ramdisks to load from floppy - See Documentation/blockdev/ramdisk.txt. + See Documentation/blockdev/ramdisk.rst. lockd.nlm_grace_period=P [NFS] Assign grace period. Format: @@ -3268,7 +3268,7 @@ pcd. [PARIDE] See header of drivers/block/paride/pcd.c. - See also Documentation/blockdev/paride.txt. + See also Documentation/blockdev/paride.rst. pci=option[,option...] [PCI] various PCI subsystem options. @@ -3512,7 +3512,7 @@ needed on a platform with proper driver support. pd. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at boot time. @@ -3527,10 +3527,10 @@ and performance comparison. pf. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pg. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pirq= [SMP,APIC] Manual mp-table setup See Documentation/x86/i386/IO-APIC.rst. @@ -3642,7 +3642,7 @@ prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk before loading. - See Documentation/blockdev/ramdisk.txt. + See Documentation/blockdev/ramdisk.rst. psi= [KNL] Enable or disable pressure stall information tracking. @@ -3664,7 +3664,7 @@ pstore.backend= Specify the name of the pstore backend to use pt. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pti= [X86_64] Control Page Table Isolation of user and kernel address spaces. Disabling this feature @@ -3693,7 +3693,7 @@ See Documentation/admin-guide/md.rst. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes - See Documentation/blockdev/ramdisk.txt. + See Documentation/blockdev/ramdisk.rst. random.trust_cpu={on,off} [KNL] Enable or disable trusting the use of the diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt deleted file mode 100644 index 627b0a1bf35e..000000000000 --- a/Documentation/blockdev/drbd/README.txt +++ /dev/null @@ -1,16 +0,0 @@ -Description - - DRBD is a shared-nothing, synchronously replicated block device. It - is designed to serve as a building block for high availability - clusters and in this context, is a "drop-in" replacement for shared - storage. Simplistically, you could see it as a network RAID 1. - - Please visit http://www.drbd.org to find out more. - -The here included files are intended to help understand the implementation - -DRBD-8.3-data-packets.svg, DRBD-data-packets.svg - relates some functions, and write packets. - -conn-states-8.dot, disk-states-8.dot, node-states-8.dot - The sub graphs of DRBD's state transitions diff --git a/Documentation/blockdev/drbd/data-structure-v9.rst b/Documentation/blockdev/drbd/data-structure-v9.rst new file mode 100644 index 000000000000..66036b901644 --- /dev/null +++ b/Documentation/blockdev/drbd/data-structure-v9.rst @@ -0,0 +1,42 @@ +================================ +kernel data structure for DRBD-9 +================================ + +This describes the in kernel data structure for DRBD-9. Starting with +Linux v3.14 we are reorganizing DRBD to use this data structure. + +Basic Data Structure +==================== + +A node has a number of DRBD resources. Each such resource has a number of +devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD +device is represented by a block device locally. + +The DRBD objects are interconnected to form a matrix as depicted below; a +drbd_peer_device object sits at each intersection between a drbd_device and a +drbd_connection:: + + /--------------+---------------+.....+---------------\ + | resource | device | | device | + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + +--------------+---------------+.....+---------------+ + : : : : : + : : : : : + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + \--------------+---------------+.....+---------------/ + +In this table, horizontally, devices can be accessed from resources by their +volume number. Likewise, peer_devices can be accessed from connections by +their volume number. Objects in the vertical direction are connected by double +linked lists. There are back pointers from peer_devices to their connections a +devices, and from connections and devices to their resource. + +All resources are in the drbd_resources double-linked list. In addition, all +devices can be accessed by their minor device number via the drbd_devices idr. + +The drbd_resource, drbd_connection, and drbd_device objects are reference +counted. The peer_device objects only serve to establish the links between +devices and connections; their lifetime is determined by the lifetime of the +device and connection which they reference. diff --git a/Documentation/blockdev/drbd/data-structure-v9.txt b/Documentation/blockdev/drbd/data-structure-v9.txt deleted file mode 100644 index 1e52a0e32624..000000000000 --- a/Documentation/blockdev/drbd/data-structure-v9.txt +++ /dev/null @@ -1,38 +0,0 @@ -This describes the in kernel data structure for DRBD-9. Starting with -Linux v3.14 we are reorganizing DRBD to use this data structure. - -Basic Data Structure -==================== - -A node has a number of DRBD resources. Each such resource has a number of -devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD -device is represented by a block device locally. - -The DRBD objects are interconnected to form a matrix as depicted below; a -drbd_peer_device object sits at each intersection between a drbd_device and a -drbd_connection: - - /--------------+---------------+.....+---------------\ - | resource | device | | device | - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - +--------------+---------------+.....+---------------+ - : : : : : - : : : : : - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - \--------------+---------------+.....+---------------/ - -In this table, horizontally, devices can be accessed from resources by their -volume number. Likewise, peer_devices can be accessed from connections by -their volume number. Objects in the vertical direction are connected by double -linked lists. There are back pointers from peer_devices to their connections a -devices, and from connections and devices to their resource. - -All resources are in the drbd_resources double-linked list. In addition, all -devices can be accessed by their minor device number via the drbd_devices idr. - -The drbd_resource, drbd_connection, and drbd_device objects are reference -counted. The peer_device objects only serve to establish the links between -devices and connections; their lifetime is determined by the lifetime of the -device and connection which they reference. diff --git a/Documentation/blockdev/drbd/figures.rst b/Documentation/blockdev/drbd/figures.rst new file mode 100644 index 000000000000..3e3fd4b8a478 --- /dev/null +++ b/Documentation/blockdev/drbd/figures.rst @@ -0,0 +1,28 @@ +.. The here included files are intended to help understand the implementation + +Data flows that Relate some functions, and write packets +======================================================== + +.. kernel-figure:: DRBD-8.3-data-packets.svg + :alt: DRBD-8.3-data-packets.svg + :align: center + +.. kernel-figure:: DRBD-data-packets.svg + :alt: DRBD-data-packets.svg + :align: center + + +Sub graphs of DRBD's state transitions +====================================== + +.. kernel-figure:: conn-states-8.dot + :alt: conn-states-8.dot + :align: center + +.. kernel-figure:: disk-states-8.dot + :alt: disk-states-8.dot + :align: center + +.. kernel-figure:: node-states-8.dot + :alt: node-states-8.dot + :align: center diff --git a/Documentation/blockdev/drbd/index.rst b/Documentation/blockdev/drbd/index.rst new file mode 100644 index 000000000000..68ecd5c113e9 --- /dev/null +++ b/Documentation/blockdev/drbd/index.rst @@ -0,0 +1,19 @@ +========================================== +Distributed Replicated Block Device - DRBD +========================================== + +Description +=========== + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Please visit http://www.drbd.org to find out more. + +.. toctree:: + :maxdepth: 1 + + data-structure-v9 + figures diff --git a/Documentation/blockdev/floppy.rst b/Documentation/blockdev/floppy.rst new file mode 100644 index 000000000000..4a8f31cf4139 --- /dev/null +++ b/Documentation/blockdev/floppy.rst @@ -0,0 +1,255 @@ +============= +Floppy Driver +============= + +FAQ list: +========= + +A FAQ list may be found in the fdutils package (see below), and also +at . + + +LILO configuration options (Thinkpad users, read this) +====================================================== + +The floppy driver is configured using the 'floppy=' option in +lilo. This option can be typed at the boot prompt, or entered in the +lilo configuration file. + +Example: If your kernel is called linux-2.6.9, type the following line +at the lilo boot prompt (if you have a thinkpad):: + + linux-2.6.9 floppy=thinkpad + +You may also enter the following line in /etc/lilo.conf, in the description +of linux-2.6.9:: + + append = "floppy=thinkpad" + +Several floppy related options may be given, example:: + + linux-2.6.9 floppy=daring floppy=two_fdc + append = "floppy=daring floppy=two_fdc" + +If you give options both in the lilo config file and on the boot +prompt, the option strings of both places are concatenated, the boot +prompt options coming last. That's why there are also options to +restore the default behavior. + + +Module configuration options +============================ + +If you use the floppy driver as a module, use the following syntax:: + + modprobe floppy floppy="" + +Example:: + + modprobe floppy floppy="omnibook messages" + +If you need certain options enabled every time you load the floppy driver, +you can put:: + + options floppy floppy="omnibook messages" + +in a configuration file in /etc/modprobe.d/. + + +The floppy driver related options are: + + floppy=asus_pci + Sets the bit mask to allow only units 0 and 1. (default) + + floppy=daring + Tells the floppy driver that you have a well behaved floppy controller. + This allows more efficient and smoother operation, but may fail on + certain controllers. This may speed up certain operations. + + floppy=0,daring + Tells the floppy driver that your floppy controller should be used + with caution. + + floppy=one_fdc + Tells the floppy driver that you have only one floppy controller. + (default) + + floppy=two_fdc / floppy=
,two_fdc + Tells the floppy driver that you have two floppy controllers. + The second floppy controller is assumed to be at
. + This option is not needed if the second controller is at address + 0x370, and if you use the 'cmos' option. + + floppy=thinkpad + Tells the floppy driver that you have a Thinkpad. Thinkpads use an + inverted convention for the disk change line. + + floppy=0,thinkpad + Tells the floppy driver that you don't have a Thinkpad. + + floppy=omnibook / floppy=nodma + Tells the floppy driver not to use Dma for data transfers. + This is needed on HP Omnibooks, which don't have a workable + DMA channel for the floppy driver. This option is also useful + if you frequently get "Unable to allocate DMA memory" messages. + Indeed, dma memory needs to be continuous in physical memory, + and is thus harder to find, whereas non-dma buffers may be + allocated in virtual memory. However, I advise against this if + you have an FDC without a FIFO (8272A or 82072). 82072A and + later are OK. You also need at least a 486 to use nodma. + If you use nodma mode, I suggest you also set the FIFO + threshold to 10 or lower, in order to limit the number of data + transfer interrupts. + + If you have a FIFO-able FDC, the floppy driver automatically + falls back on non DMA mode if no DMA-able memory can be found. + If you want to avoid this, explicitly ask for 'yesdma'. + + floppy=yesdma + Tells the floppy driver that a workable DMA channel is available. + (default) + + floppy=nofifo + Disables the FIFO entirely. This is needed if you get "Bus + master arbitration error" messages from your Ethernet card (or + from other devices) while accessing the floppy. + + floppy=usefifo + Enables the FIFO. (default) + + floppy=,fifo_depth + Sets the FIFO threshold. This is mostly relevant in DMA + mode. If this is higher, the floppy driver tolerates more + interrupt latency, but it triggers more interrupts (i.e. it + imposes more load on the rest of the system). If this is + lower, the interrupt latency should be lower too (faster + processor). The benefit of a lower threshold is less + interrupts. + + To tune the fifo threshold, switch on over/underrun messages + using 'floppycontrol --messages'. Then access a floppy + disk. If you get a huge amount of "Over/Underrun - retrying" + messages, then the fifo threshold is too low. Try with a + higher value, until you only get an occasional Over/Underrun. + It is a good idea to compile the floppy driver as a module + when doing this tuning. Indeed, it allows to try different + fifo values without rebooting the machine for each test. Note + that you need to do 'floppycontrol --messages' every time you + re-insert the module. + + Usually, tuning the fifo threshold should not be needed, as + the default (0xa) is reasonable. + + floppy=,,cmos + Sets the CMOS type of to . This is mandatory if + you have more than two floppy drives (only two can be + described in the physical CMOS), or if your BIOS uses + non-standard CMOS types. The CMOS types are: + + == ================================== + 0 Use the value of the physical CMOS + 1 5 1/4 DD + 2 5 1/4 HD + 3 3 1/2 DD + 4 3 1/2 HD + 5 3 1/2 ED + 6 3 1/2 ED + 16 unknown or not installed + == ================================== + + (Note: there are two valid types for ED drives. This is because 5 was + initially chosen to represent floppy *tapes*, and 6 for ED drives. + AMI ignored this, and used 5 for ED drives. That's why the floppy + driver handles both.) + + floppy=unexpected_interrupts + Print a warning message when an unexpected interrupt is received. + (default) + + floppy=no_unexpected_interrupts / floppy=L40SX + Don't print a message when an unexpected interrupt is received. This + is needed on IBM L40SX laptops in certain video modes. (There seems + to be an interaction between video and floppy. The unexpected + interrupts affect only performance, and can be safely ignored.) + + floppy=broken_dcl + Don't use the disk change line, but assume that the disk was + changed whenever the device node is reopened. Needed on some + boxes where the disk change line is broken or unsupported. + This should be regarded as a stopgap measure, indeed it makes + floppy operation less efficient due to unneeded cache + flushings, and slightly more unreliable. Please verify your + cable, connection and jumper settings if you have any DCL + problems. However, some older drives, and also some laptops + are known not to have a DCL. + + floppy=debug + Print debugging messages. + + floppy=messages + Print informational messages for some operations (disk change + notifications, warnings about over and underruns, and about + autodetection). + + floppy=silent_dcl_clear + Uses a less noisy way to clear the disk change line (which + doesn't involve seeks). Implied by 'daring' option. + + floppy=,irq + Sets the floppy IRQ to instead of 6. + + floppy=,dma + Sets the floppy DMA channel to instead of 2. + + floppy=slow + Use PS/2 stepping rate:: + + PS/2 floppies have much slower step rates than regular floppies. + It's been recommended that take about 1/4 of the default speed + in some more extreme cases. + + +Supporting utilities and additional documentation: +================================================== + +Additional parameters of the floppy driver can be configured at +runtime. Utilities which do this can be found in the fdutils package. +This package also contains a new version of mtools which allows to +access high capacity disks (up to 1992K on a high density 3 1/2 disk!). +It also contains additional documentation about the floppy driver. + +The latest version can be found at fdutils homepage: + + http://fdutils.linux.lu + +The fdutils releases can be found at: + + http://fdutils.linux.lu/download.html + + http://www.tux.org/pub/knaff/fdutils/ + + ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ + +Reporting problems about the floppy driver +========================================== + +If you have a question or a bug report about the floppy driver, mail +me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use +comp.os.linux.hardware. As the volume in these groups is rather high, +be sure to include the word "floppy" (or "FLOPPY") in the subject +line. If the reported problem happens when mounting floppy disks, be +sure to mention also the type of the filesystem in the subject line. + +Be sure to read the FAQ before mailing/posting any bug reports! + +Alain + +Changelog +========= + +10-30-2004 : + Cleanup, updating, add reference to module configuration. + James Nelson + +6-3-2000 : + Original Document diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt deleted file mode 100644 index e2240f5ab64d..000000000000 --- a/Documentation/blockdev/floppy.txt +++ /dev/null @@ -1,245 +0,0 @@ -This file describes the floppy driver. - -FAQ list: -========= - - A FAQ list may be found in the fdutils package (see below), and also -at . - - -LILO configuration options (Thinkpad users, read this) -====================================================== - - The floppy driver is configured using the 'floppy=' option in -lilo. This option can be typed at the boot prompt, or entered in the -lilo configuration file. - - Example: If your kernel is called linux-2.6.9, type the following line -at the lilo boot prompt (if you have a thinkpad): - - linux-2.6.9 floppy=thinkpad - -You may also enter the following line in /etc/lilo.conf, in the description -of linux-2.6.9: - - append = "floppy=thinkpad" - - Several floppy related options may be given, example: - - linux-2.6.9 floppy=daring floppy=two_fdc - append = "floppy=daring floppy=two_fdc" - - If you give options both in the lilo config file and on the boot -prompt, the option strings of both places are concatenated, the boot -prompt options coming last. That's why there are also options to -restore the default behavior. - - -Module configuration options -============================ - - If you use the floppy driver as a module, use the following syntax: -modprobe floppy floppy="" - -Example: - modprobe floppy floppy="omnibook messages" - - If you need certain options enabled every time you load the floppy driver, -you can put: - - options floppy floppy="omnibook messages" - -in a configuration file in /etc/modprobe.d/. - - - The floppy driver related options are: - - floppy=asus_pci - Sets the bit mask to allow only units 0 and 1. (default) - - floppy=daring - Tells the floppy driver that you have a well behaved floppy controller. - This allows more efficient and smoother operation, but may fail on - certain controllers. This may speed up certain operations. - - floppy=0,daring - Tells the floppy driver that your floppy controller should be used - with caution. - - floppy=one_fdc - Tells the floppy driver that you have only one floppy controller. - (default) - - floppy=two_fdc - floppy=
,two_fdc - Tells the floppy driver that you have two floppy controllers. - The second floppy controller is assumed to be at
. - This option is not needed if the second controller is at address - 0x370, and if you use the 'cmos' option. - - floppy=thinkpad - Tells the floppy driver that you have a Thinkpad. Thinkpads use an - inverted convention for the disk change line. - - floppy=0,thinkpad - Tells the floppy driver that you don't have a Thinkpad. - - floppy=omnibook - floppy=nodma - Tells the floppy driver not to use Dma for data transfers. - This is needed on HP Omnibooks, which don't have a workable - DMA channel for the floppy driver. This option is also useful - if you frequently get "Unable to allocate DMA memory" messages. - Indeed, dma memory needs to be continuous in physical memory, - and is thus harder to find, whereas non-dma buffers may be - allocated in virtual memory. However, I advise against this if - you have an FDC without a FIFO (8272A or 82072). 82072A and - later are OK. You also need at least a 486 to use nodma. - If you use nodma mode, I suggest you also set the FIFO - threshold to 10 or lower, in order to limit the number of data - transfer interrupts. - - If you have a FIFO-able FDC, the floppy driver automatically - falls back on non DMA mode if no DMA-able memory can be found. - If you want to avoid this, explicitly ask for 'yesdma'. - - floppy=yesdma - Tells the floppy driver that a workable DMA channel is available. - (default) - - floppy=nofifo - Disables the FIFO entirely. This is needed if you get "Bus - master arbitration error" messages from your Ethernet card (or - from other devices) while accessing the floppy. - - floppy=usefifo - Enables the FIFO. (default) - - floppy=,fifo_depth - Sets the FIFO threshold. This is mostly relevant in DMA - mode. If this is higher, the floppy driver tolerates more - interrupt latency, but it triggers more interrupts (i.e. it - imposes more load on the rest of the system). If this is - lower, the interrupt latency should be lower too (faster - processor). The benefit of a lower threshold is less - interrupts. - - To tune the fifo threshold, switch on over/underrun messages - using 'floppycontrol --messages'. Then access a floppy - disk. If you get a huge amount of "Over/Underrun - retrying" - messages, then the fifo threshold is too low. Try with a - higher value, until you only get an occasional Over/Underrun. - It is a good idea to compile the floppy driver as a module - when doing this tuning. Indeed, it allows to try different - fifo values without rebooting the machine for each test. Note - that you need to do 'floppycontrol --messages' every time you - re-insert the module. - - Usually, tuning the fifo threshold should not be needed, as - the default (0xa) is reasonable. - - floppy=,,cmos - Sets the CMOS type of to . This is mandatory if - you have more than two floppy drives (only two can be - described in the physical CMOS), or if your BIOS uses - non-standard CMOS types. The CMOS types are: - - 0 - Use the value of the physical CMOS - 1 - 5 1/4 DD - 2 - 5 1/4 HD - 3 - 3 1/2 DD - 4 - 3 1/2 HD - 5 - 3 1/2 ED - 6 - 3 1/2 ED - 16 - unknown or not installed - - (Note: there are two valid types for ED drives. This is because 5 was - initially chosen to represent floppy *tapes*, and 6 for ED drives. - AMI ignored this, and used 5 for ED drives. That's why the floppy - driver handles both.) - - floppy=unexpected_interrupts - Print a warning message when an unexpected interrupt is received. - (default) - - floppy=no_unexpected_interrupts - floppy=L40SX - Don't print a message when an unexpected interrupt is received. This - is needed on IBM L40SX laptops in certain video modes. (There seems - to be an interaction between video and floppy. The unexpected - interrupts affect only performance, and can be safely ignored.) - - floppy=broken_dcl - Don't use the disk change line, but assume that the disk was - changed whenever the device node is reopened. Needed on some - boxes where the disk change line is broken or unsupported. - This should be regarded as a stopgap measure, indeed it makes - floppy operation less efficient due to unneeded cache - flushings, and slightly more unreliable. Please verify your - cable, connection and jumper settings if you have any DCL - problems. However, some older drives, and also some laptops - are known not to have a DCL. - - floppy=debug - Print debugging messages. - - floppy=messages - Print informational messages for some operations (disk change - notifications, warnings about over and underruns, and about - autodetection). - - floppy=silent_dcl_clear - Uses a less noisy way to clear the disk change line (which - doesn't involve seeks). Implied by 'daring' option. - - floppy=,irq - Sets the floppy IRQ to instead of 6. - - floppy=,dma - Sets the floppy DMA channel to instead of 2. - - floppy=slow - Use PS/2 stepping rate: - " PS/2 floppies have much slower step rates than regular floppies. - It's been recommended that take about 1/4 of the default speed - in some more extreme cases." - - -Supporting utilities and additional documentation: -================================================== - - Additional parameters of the floppy driver can be configured at -runtime. Utilities which do this can be found in the fdutils package. -This package also contains a new version of mtools which allows to -access high capacity disks (up to 1992K on a high density 3 1/2 disk!). -It also contains additional documentation about the floppy driver. - -The latest version can be found at fdutils homepage: - http://fdutils.linux.lu - -The fdutils releases can be found at: - http://fdutils.linux.lu/download.html - http://www.tux.org/pub/knaff/fdutils/ - ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ - -Reporting problems about the floppy driver -========================================== - - If you have a question or a bug report about the floppy driver, mail -me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use -comp.os.linux.hardware. As the volume in these groups is rather high, -be sure to include the word "floppy" (or "FLOPPY") in the subject -line. If the reported problem happens when mounting floppy disks, be -sure to mention also the type of the filesystem in the subject line. - - Be sure to read the FAQ before mailing/posting any bug reports! - - Alain - -Changelog -========= - -10-30-2004 : Cleanup, updating, add reference to module configuration. - James Nelson - -6-3-2000 : Original Document diff --git a/Documentation/blockdev/index.rst b/Documentation/blockdev/index.rst new file mode 100644 index 000000000000..a9af6ed8b4aa --- /dev/null +++ b/Documentation/blockdev/index.rst @@ -0,0 +1,16 @@ +:orphan: + +=========================== +The Linux RapidIO Subsystem +=========================== + +.. toctree:: + :maxdepth: 1 + + floppy + nbd + paride + ramdisk + zram + + drbd/index diff --git a/Documentation/blockdev/nbd.rst b/Documentation/blockdev/nbd.rst new file mode 100644 index 000000000000..d78dfe559dcf --- /dev/null +++ b/Documentation/blockdev/nbd.rst @@ -0,0 +1,31 @@ +================================== +Network Block Device (TCP version) +================================== + +1) Overview +----------- + +What is it: With this compiled in the kernel (or as a module), Linux +can use a remote server as one of its block devices. So every time +the client computer wants to read, e.g., /dev/nb0, it sends a +request over TCP to the server, which will reply with the data read. +This can be used for stations with low disk space (or even diskless) +to borrow disk space from another computer. +Unlike NFS, it is possible to put any filesystem on it, etc. + +For more information, or to download the nbd-client and nbd-server +tools, go to http://nbd.sf.net/. + +The nbd kernel module need only be installed on the client +system, as the nbd-server is completely in userspace. In fact, +the nbd-server has been successfully ported to other operating +systems, including Windows. + +A) NBD parameters +----------------- + +max_part + Number of partitions per device (default: 0). + +nbds_max + Number of block devices that should be initialized (default: 16). diff --git a/Documentation/blockdev/nbd.txt b/Documentation/blockdev/nbd.txt deleted file mode 100644 index db242ea2bce8..000000000000 --- a/Documentation/blockdev/nbd.txt +++ /dev/null @@ -1,31 +0,0 @@ -Network Block Device (TCP version) -================================== - -1) Overview ------------ - -What is it: With this compiled in the kernel (or as a module), Linux -can use a remote server as one of its block devices. So every time -the client computer wants to read, e.g., /dev/nb0, it sends a -request over TCP to the server, which will reply with the data read. -This can be used for stations with low disk space (or even diskless) -to borrow disk space from another computer. -Unlike NFS, it is possible to put any filesystem on it, etc. - -For more information, or to download the nbd-client and nbd-server -tools, go to http://nbd.sf.net/. - -The nbd kernel module need only be installed on the client -system, as the nbd-server is completely in userspace. In fact, -the nbd-server has been successfully ported to other operating -systems, including Windows. - -A) NBD parameters ------------------ - -max_part - Number of partitions per device (default: 0). - -nbds_max - Number of block devices that should be initialized (default: 16). - diff --git a/Documentation/blockdev/paride.rst b/Documentation/blockdev/paride.rst new file mode 100644 index 000000000000..87b4278bf314 --- /dev/null +++ b/Documentation/blockdev/paride.rst @@ -0,0 +1,439 @@ +=================================== +Linux and parallel port IDE devices +=================================== + +PARIDE v1.03 (c) 1997-8 Grant Guenther + +1. Introduction +=============== + +Owing to the simplicity and near universality of the parallel port interface +to personal computers, many external devices such as portable hard-disk, +CD-ROM, LS-120 and tape drives use the parallel port to connect to their +host computer. While some devices (notably scanners) use ad-hoc methods +to pass commands and data through the parallel port interface, most +external devices are actually identical to an internal model, but with +a parallel-port adapter chip added in. Some of the original parallel port +adapters were little more than mechanisms for multiplexing a SCSI bus. +(The Iomega PPA-3 adapter used in the ZIP drives is an example of this +approach). Most current designs, however, take a different approach. +The adapter chip reproduces a small ISA or IDE bus in the external device +and the communication protocol provides operations for reading and writing +device registers, as well as data block transfer functions. Sometimes, +the device being addressed via the parallel cable is a standard SCSI +controller like an NCR 5380. The "ditto" family of external tape +drives use the ISA replicator to interface a floppy disk controller, +which is then connected to a floppy-tape mechanism. The vast majority +of external parallel port devices, however, are now based on standard +IDE type devices, which require no intermediate controller. If one +were to open up a parallel port CD-ROM drive, for instance, one would +find a standard ATAPI CD-ROM drive, a power supply, and a single adapter +that interconnected a standard PC parallel port cable and a standard +IDE cable. It is usually possible to exchange the CD-ROM device with +any other device using the IDE interface. + +The document describes the support in Linux for parallel port IDE +devices. It does not cover parallel port SCSI devices, "ditto" tape +drives or scanners. Many different devices are supported by the +parallel port IDE subsystem, including: + + - MicroSolutions backpack CD-ROM + - MicroSolutions backpack PD/CD + - MicroSolutions backpack hard-drives + - MicroSolutions backpack 8000t tape drive + - SyQuest EZ-135, EZ-230 & SparQ drives + - Avatar Shark + - Imation Superdisk LS-120 + - Maxell Superdisk LS-120 + - FreeCom Power CD + - Hewlett-Packard 5GB and 8GB tape drives + - Hewlett-Packard 7100 and 7200 CD-RW drives + +as well as most of the clone and no-name products on the market. + +To support such a wide range of devices, PARIDE, the parallel port IDE +subsystem, is actually structured in three parts. There is a base +paride module which provides a registry and some common methods for +accessing the parallel ports. The second component is a set of +high-level drivers for each of the different types of supported devices: + + === ============= + pd IDE disk + pcd ATAPI CD-ROM + pf ATAPI disk + pt ATAPI tape + pg ATAPI generic + === ============= + +(Currently, the pg driver is only used with CD-R drives). + +The high-level drivers function according to the relevant standards. +The third component of PARIDE is a set of low-level protocol drivers +for each of the parallel port IDE adapter chips. Thanks to the interest +and encouragement of Linux users from many parts of the world, +support is available for almost all known adapter protocols: + + ==== ====================================== ==== + aten ATEN EH-100 (HK) + bpck Microsolutions backpack (US) + comm DataStor (old-type) "commuter" adapter (TW) + dstr DataStor EP-2000 (TW) + epat Shuttle EPAT (UK) + epia Shuttle EPIA (UK) + fit2 FIT TD-2000 (US) + fit3 FIT TD-3000 (US) + friq Freecom IQ cable (DE) + frpw Freecom Power (DE) + kbic KingByte KBIC-951A and KBIC-971A (TW) + ktti KT Technology PHd adapter (SG) + on20 OnSpec 90c20 (US) + on26 OnSpec 90c26 (US) + ==== ====================================== ==== + + +2. Using the PARIDE subsystem +============================= + +While configuring the Linux kernel, you may choose either to build +the PARIDE drivers into your kernel, or to build them as modules. + +In either case, you will need to select "Parallel port IDE device support" +as well as at least one of the high-level drivers and at least one +of the parallel port communication protocols. If you do not know +what kind of parallel port adapter is used in your drive, you could +begin by checking the file names and any text files on your DOS +installation floppy. Alternatively, you can look at the markings on +the adapter chip itself. That's usually sufficient to identify the +correct device. + +You can actually select all the protocol modules, and allow the PARIDE +subsystem to try them all for you. + +For the "brand-name" products listed above, here are the protocol +and high-level drivers that you would use: + + ================ ============ ====== ======== + Manufacturer Model Driver Protocol + ================ ============ ====== ======== + MicroSolutions CD-ROM pcd bpck + MicroSolutions PD drive pf bpck + MicroSolutions hard-drive pd bpck + MicroSolutions 8000t tape pt bpck + SyQuest EZ, SparQ pd epat + Imation Superdisk pf epat + Maxell Superdisk pf friq + Avatar Shark pd epat + FreeCom CD-ROM pcd frpw + Hewlett-Packard 5GB Tape pt epat + Hewlett-Packard 7200e (CD) pcd epat + Hewlett-Packard 7200e (CD-R) pg epat + ================ ============ ====== ======== + +2.1 Configuring built-in drivers +--------------------------------- + +We recommend that you get to know how the drivers work and how to +configure them as loadable modules, before attempting to compile a +kernel with the drivers built-in. + +If you built all of your PARIDE support directly into your kernel, +and you have just a single parallel port IDE device, your kernel should +locate it automatically for you. If you have more than one device, +you may need to give some command line options to your bootloader +(eg: LILO), how to do that is beyond the scope of this document. + +The high-level drivers accept a number of command line parameters, all +of which are documented in the source files in linux/drivers/block/paride. +By default, each driver will automatically try all parallel ports it +can find, and all protocol types that have been installed, until it finds +a parallel port IDE adapter. Once it finds one, the probe stops. So, +if you have more than one device, you will need to tell the drivers +how to identify them. This requires specifying the port address, the +protocol identification number and, for some devices, the drive's +chain ID. While your system is booting, a number of messages are +displayed on the console. Like all such messages, they can be +reviewed with the 'dmesg' command. Among those messages will be +some lines like:: + + paride: bpck registered as protocol 0 + paride: epat registered as protocol 1 + +The numbers will always be the same until you build a new kernel with +different protocol selections. You should note these numbers as you +will need them to identify the devices. + +If you happen to be using a MicroSolutions backpack device, you will +also need to know the unit ID number for each drive. This is usually +the last two digits of the drive's serial number (but read MicroSolutions' +documentation about this). + +As an example, let's assume that you have a MicroSolutions PD/CD drive +with unit ID number 36 connected to the parallel port at 0x378, a SyQuest +EZ-135 connected to the chained port on the PD/CD drive and also an +Imation Superdisk connected to port 0x278. You could give the following +options on your boot command:: + + pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 + +In the last option, pf.drive1 configures device /dev/pf1, the 0x378 +is the parallel port base address, the 0 is the protocol registration +number and 36 is the chain ID. + +Please note: while PARIDE will work both with and without the +PARPORT parallel port sharing system that is included by the +"Parallel port support" option, PARPORT must be included and enabled +if you want to use chains of devices on the same parallel port. + +2.2 Loading and configuring PARIDE as modules +---------------------------------------------- + +It is much faster and simpler to get to understand the PARIDE drivers +if you use them as loadable kernel modules. + +Note 1: + using these drivers with the "kerneld" automatic module loading + system is not recommended for beginners, and is not documented here. + +Note 2: + if you build PARPORT support as a loadable module, PARIDE must + also be built as loadable modules, and PARPORT must be loaded before + the PARIDE modules. + +To use PARIDE, you must begin by:: + + insmod paride + +this loads a base module which provides a registry for the protocols, +among other tasks. + +Then, load as many of the protocol modules as you think you might need. +As you load each module, it will register the protocols that it supports, +and print a log message to your kernel log file and your console. For +example:: + + # insmod epat + paride: epat registered as protocol 0 + # insmod kbic + paride: k951 registered as protocol 1 + paride: k971 registered as protocol 2 + +Finally, you can load high-level drivers for each kind of device that +you have connected. By default, each driver will autoprobe for a single +device, but you can support up to four similar devices by giving their +individual co-ordinates when you load the driver. + +For example, if you had two no-name CD-ROM drives both using the +KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc +you could give the following command:: + + # insmod pcd drive0=0x378,1 drive1=0x3bc,1 + +For most adapters, giving a port address and protocol number is sufficient, +but check the source files in linux/drivers/block/paride for more +information. (Hopefully someone will write some man pages one day !). + +As another example, here's what happens when PARPORT is installed, and +a SyQuest EZ-135 is attached to port 0x378:: + + # insmod paride + paride: version 1.0 installed + # insmod epat + paride: epat registered as protocol 0 + # insmod pd + pd: pd version 1.0, major 45, cluster 64, nice 0 + pda: Sharing parport1 at 0x378 + pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 + pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media + pda: pda1 + +Note that the last line is the output from the generic partition table +scanner - in this case it reports that it has found a disk with one partition. + +2.3 Using a PARIDE device +-------------------------- + +Once the drivers have been loaded, you can access PARIDE devices in the +same way as their traditional counterparts. You will probably need to +create the device "special files". Here is a simple script that you can +cut to a file and execute:: + + #!/bin/bash + # + # mkd -- a script to create the device special files for the PARIDE subsystem + # + function mkdev { + mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 + } + # + function pd { + D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) + mkdev pd$D b 45 $[ $1 * 16 ] + for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] + done + } + # + cd /dev + # + for u in 0 1 2 3 ; do pd $u ; done + for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done + for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done + for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done + for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done + for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done + # + # end of mkd + +With the device files and drivers in place, you can access PARIDE devices +like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: + + mount /dev/pcd0 /cdrom + +If you have a fresh Avatar Shark cartridge, and the drive is pda, you +might do something like:: + + fdisk /dev/pda -- make a new partition table with + partition 1 of type 83 + + mke2fs /dev/pda1 -- to build the file system + + mkdir /shark -- make a place to mount the disk + + mount /dev/pda1 /shark + +Devices like the Imation superdisk work in the same way, except that +they do not have a partition table. For example to make a 120MB +floppy that you could share with a DOS system:: + + mkdosfs /dev/pf0 + mount /dev/pf0 /mnt + + +2.4 The pf driver +------------------ + +The pf driver is intended for use with parallel port ATAPI disk +devices. The most common devices in this category are PD drives +and LS-120 drives. Traditionally, media for these devices are not +partitioned. Consequently, the pf driver does not support partitioned +media. This may be changed in a future version of the driver. + +2.5 Using the pt driver +------------------------ + +The pt driver for parallel port ATAPI tape drives is a minimal driver. +It does not yet support many of the standard tape ioctl operations. +For best performance, a block size of 32KB should be used. You will +probably want to set the parallel port delay to 0, if you can. + +2.6 Using the pg driver +------------------------ + +The pg driver can be used in conjunction with the cdrecord program +to create CD-ROMs. Please get cdrecord version 1.6.1 or later +from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media +your parallel port should ideally be set to EPP mode, and the "port delay" +should be set to 0. With those settings it is possible to record at 2x +speed without any buffer underruns. If you cannot get the driver to work +in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. + + +3. Troubleshooting +================== + +3.1 Use EPP mode if you can +---------------------------- + +The most common problems that people report with the PARIDE drivers +concern the parallel port CMOS settings. At this time, none of the +PARIDE protocol modules support ECP mode, or any ECP combination modes. +If you are able to do so, please set your parallel port into EPP mode +using your CMOS setup procedure. + +3.2 Check the port delay +------------------------- + +Some parallel ports cannot reliably transfer data at full speed. To +offset the errors, the PARIDE protocol modules introduce a "port +delay" between each access to the i/o ports. Each protocol sets +a default value for this delay. In most cases, the user can override +the default and set it to 0 - resulting in somewhat higher transfer +rates. In some rare cases (especially with older 486 systems) the +default delays are not long enough. if you experience corrupt data +transfers, or unexpected failures, you may wish to increase the +port delay. The delay can be programmed using the "driveN" parameters +to each of the high-level drivers. Please see the notes above, or +read the comments at the beginning of the driver source files in +linux/drivers/block/paride. + +3.3 Some drives need a printer reset +------------------------------------- + +There appear to be a number of "noname" external drives on the market +that do not always power up correctly. We have noticed this with some +drives based on OnSpec and older Freecom adapters. In these rare cases, +the adapter can often be reinitialised by issuing a "printer reset" on +the parallel port. As the reset operation is potentially disruptive in +multiple device environments, the PARIDE drivers will not do it +automatically. You can however, force a printer reset by doing:: + + insmod lp reset=1 + rmmod lp + +If you have one of these marginal cases, you should probably build +your paride drivers as modules, and arrange to do the printer reset +before loading the PARIDE drivers. + +3.4 Use the verbose option and dmesg if you need help +------------------------------------------------------ + +While a lot of testing has gone into these drivers to make them work +as smoothly as possible, problems will arise. If you do have problems, +please check all the obvious things first: does the drive work in +DOS with the manufacturer's drivers ? If that doesn't yield any useful +clues, then please make sure that only one drive is hooked to your system, +and that either (a) PARPORT is enabled or (b) no other device driver +is using your parallel port (check in /proc/ioports). Then, load the +appropriate drivers (you can load several protocol modules if you want) +as in:: + + # insmod paride + # insmod epat + # insmod bpck + # insmod kbic + ... + # insmod pd verbose=1 + +(using the correct driver for the type of device you have, of course). +The verbose=1 parameter will cause the drivers to log a trace of their +activity as they attempt to locate your drive. + +Use 'dmesg' to capture a log of all the PARIDE messages (any messages +beginning with paride:, a protocol module's name or a driver's name) and +include that with your bug report. You can submit a bug report in one +of two ways. Either send it directly to the author of the PARIDE suite, +by e-mail to grant@torque.net, or join the linux-parport mailing list +and post your report there. + +3.5 For more information or help +--------------------------------- + +You can join the linux-parport mailing list by sending a mail message +to: + + linux-parport-request@torque.net + +with the single word:: + + subscribe + +in the body of the mail message (not in the subject line). Please be +sure that your mail program is correctly set up when you do this, as +the list manager is a robot that will subscribe you using the reply +address in your mail headers. REMOVE any anti-spam gimmicks you may +have in your mail headers, when sending mail to the list server. + +You might also find some useful information on the linux-parport +web pages (although they are not always up to date) at + + http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/blockdev/paride.txt b/Documentation/blockdev/paride.txt deleted file mode 100644 index ee6717e3771d..000000000000 --- a/Documentation/blockdev/paride.txt +++ /dev/null @@ -1,417 +0,0 @@ - - Linux and parallel port IDE devices - -PARIDE v1.03 (c) 1997-8 Grant Guenther - -1. Introduction - -Owing to the simplicity and near universality of the parallel port interface -to personal computers, many external devices such as portable hard-disk, -CD-ROM, LS-120 and tape drives use the parallel port to connect to their -host computer. While some devices (notably scanners) use ad-hoc methods -to pass commands and data through the parallel port interface, most -external devices are actually identical to an internal model, but with -a parallel-port adapter chip added in. Some of the original parallel port -adapters were little more than mechanisms for multiplexing a SCSI bus. -(The Iomega PPA-3 adapter used in the ZIP drives is an example of this -approach). Most current designs, however, take a different approach. -The adapter chip reproduces a small ISA or IDE bus in the external device -and the communication protocol provides operations for reading and writing -device registers, as well as data block transfer functions. Sometimes, -the device being addressed via the parallel cable is a standard SCSI -controller like an NCR 5380. The "ditto" family of external tape -drives use the ISA replicator to interface a floppy disk controller, -which is then connected to a floppy-tape mechanism. The vast majority -of external parallel port devices, however, are now based on standard -IDE type devices, which require no intermediate controller. If one -were to open up a parallel port CD-ROM drive, for instance, one would -find a standard ATAPI CD-ROM drive, a power supply, and a single adapter -that interconnected a standard PC parallel port cable and a standard -IDE cable. It is usually possible to exchange the CD-ROM device with -any other device using the IDE interface. - -The document describes the support in Linux for parallel port IDE -devices. It does not cover parallel port SCSI devices, "ditto" tape -drives or scanners. Many different devices are supported by the -parallel port IDE subsystem, including: - - MicroSolutions backpack CD-ROM - MicroSolutions backpack PD/CD - MicroSolutions backpack hard-drives - MicroSolutions backpack 8000t tape drive - SyQuest EZ-135, EZ-230 & SparQ drives - Avatar Shark - Imation Superdisk LS-120 - Maxell Superdisk LS-120 - FreeCom Power CD - Hewlett-Packard 5GB and 8GB tape drives - Hewlett-Packard 7100 and 7200 CD-RW drives - -as well as most of the clone and no-name products on the market. - -To support such a wide range of devices, PARIDE, the parallel port IDE -subsystem, is actually structured in three parts. There is a base -paride module which provides a registry and some common methods for -accessing the parallel ports. The second component is a set of -high-level drivers for each of the different types of supported devices: - - pd IDE disk - pcd ATAPI CD-ROM - pf ATAPI disk - pt ATAPI tape - pg ATAPI generic - -(Currently, the pg driver is only used with CD-R drives). - -The high-level drivers function according to the relevant standards. -The third component of PARIDE is a set of low-level protocol drivers -for each of the parallel port IDE adapter chips. Thanks to the interest -and encouragement of Linux users from many parts of the world, -support is available for almost all known adapter protocols: - - aten ATEN EH-100 (HK) - bpck Microsolutions backpack (US) - comm DataStor (old-type) "commuter" adapter (TW) - dstr DataStor EP-2000 (TW) - epat Shuttle EPAT (UK) - epia Shuttle EPIA (UK) - fit2 FIT TD-2000 (US) - fit3 FIT TD-3000 (US) - friq Freecom IQ cable (DE) - frpw Freecom Power (DE) - kbic KingByte KBIC-951A and KBIC-971A (TW) - ktti KT Technology PHd adapter (SG) - on20 OnSpec 90c20 (US) - on26 OnSpec 90c26 (US) - - -2. Using the PARIDE subsystem - -While configuring the Linux kernel, you may choose either to build -the PARIDE drivers into your kernel, or to build them as modules. - -In either case, you will need to select "Parallel port IDE device support" -as well as at least one of the high-level drivers and at least one -of the parallel port communication protocols. If you do not know -what kind of parallel port adapter is used in your drive, you could -begin by checking the file names and any text files on your DOS -installation floppy. Alternatively, you can look at the markings on -the adapter chip itself. That's usually sufficient to identify the -correct device. - -You can actually select all the protocol modules, and allow the PARIDE -subsystem to try them all for you. - -For the "brand-name" products listed above, here are the protocol -and high-level drivers that you would use: - - Manufacturer Model Driver Protocol - - MicroSolutions CD-ROM pcd bpck - MicroSolutions PD drive pf bpck - MicroSolutions hard-drive pd bpck - MicroSolutions 8000t tape pt bpck - SyQuest EZ, SparQ pd epat - Imation Superdisk pf epat - Maxell Superdisk pf friq - Avatar Shark pd epat - FreeCom CD-ROM pcd frpw - Hewlett-Packard 5GB Tape pt epat - Hewlett-Packard 7200e (CD) pcd epat - Hewlett-Packard 7200e (CD-R) pg epat - -2.1 Configuring built-in drivers - -We recommend that you get to know how the drivers work and how to -configure them as loadable modules, before attempting to compile a -kernel with the drivers built-in. - -If you built all of your PARIDE support directly into your kernel, -and you have just a single parallel port IDE device, your kernel should -locate it automatically for you. If you have more than one device, -you may need to give some command line options to your bootloader -(eg: LILO), how to do that is beyond the scope of this document. - -The high-level drivers accept a number of command line parameters, all -of which are documented in the source files in linux/drivers/block/paride. -By default, each driver will automatically try all parallel ports it -can find, and all protocol types that have been installed, until it finds -a parallel port IDE adapter. Once it finds one, the probe stops. So, -if you have more than one device, you will need to tell the drivers -how to identify them. This requires specifying the port address, the -protocol identification number and, for some devices, the drive's -chain ID. While your system is booting, a number of messages are -displayed on the console. Like all such messages, they can be -reviewed with the 'dmesg' command. Among those messages will be -some lines like: - - paride: bpck registered as protocol 0 - paride: epat registered as protocol 1 - -The numbers will always be the same until you build a new kernel with -different protocol selections. You should note these numbers as you -will need them to identify the devices. - -If you happen to be using a MicroSolutions backpack device, you will -also need to know the unit ID number for each drive. This is usually -the last two digits of the drive's serial number (but read MicroSolutions' -documentation about this). - -As an example, let's assume that you have a MicroSolutions PD/CD drive -with unit ID number 36 connected to the parallel port at 0x378, a SyQuest -EZ-135 connected to the chained port on the PD/CD drive and also an -Imation Superdisk connected to port 0x278. You could give the following -options on your boot command: - - pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 - -In the last option, pf.drive1 configures device /dev/pf1, the 0x378 -is the parallel port base address, the 0 is the protocol registration -number and 36 is the chain ID. - -Please note: while PARIDE will work both with and without the -PARPORT parallel port sharing system that is included by the -"Parallel port support" option, PARPORT must be included and enabled -if you want to use chains of devices on the same parallel port. - -2.2 Loading and configuring PARIDE as modules - -It is much faster and simpler to get to understand the PARIDE drivers -if you use them as loadable kernel modules. - -Note 1: using these drivers with the "kerneld" automatic module loading -system is not recommended for beginners, and is not documented here. - -Note 2: if you build PARPORT support as a loadable module, PARIDE must -also be built as loadable modules, and PARPORT must be loaded before the -PARIDE modules. - -To use PARIDE, you must begin by - - insmod paride - -this loads a base module which provides a registry for the protocols, -among other tasks. - -Then, load as many of the protocol modules as you think you might need. -As you load each module, it will register the protocols that it supports, -and print a log message to your kernel log file and your console. For -example: - - # insmod epat - paride: epat registered as protocol 0 - # insmod kbic - paride: k951 registered as protocol 1 - paride: k971 registered as protocol 2 - -Finally, you can load high-level drivers for each kind of device that -you have connected. By default, each driver will autoprobe for a single -device, but you can support up to four similar devices by giving their -individual co-ordinates when you load the driver. - -For example, if you had two no-name CD-ROM drives both using the -KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc -you could give the following command: - - # insmod pcd drive0=0x378,1 drive1=0x3bc,1 - -For most adapters, giving a port address and protocol number is sufficient, -but check the source files in linux/drivers/block/paride for more -information. (Hopefully someone will write some man pages one day !). - -As another example, here's what happens when PARPORT is installed, and -a SyQuest EZ-135 is attached to port 0x378: - - # insmod paride - paride: version 1.0 installed - # insmod epat - paride: epat registered as protocol 0 - # insmod pd - pd: pd version 1.0, major 45, cluster 64, nice 0 - pda: Sharing parport1 at 0x378 - pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 - pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media - pda: pda1 - -Note that the last line is the output from the generic partition table -scanner - in this case it reports that it has found a disk with one partition. - -2.3 Using a PARIDE device - -Once the drivers have been loaded, you can access PARIDE devices in the -same way as their traditional counterparts. You will probably need to -create the device "special files". Here is a simple script that you can -cut to a file and execute: - -#!/bin/bash -# -# mkd -- a script to create the device special files for the PARIDE subsystem -# -function mkdev { - mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 -} -# -function pd { - D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) - mkdev pd$D b 45 $[ $1 * 16 ] - for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] - done -} -# -cd /dev -# -for u in 0 1 2 3 ; do pd $u ; done -for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done -for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done -for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done -for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done -for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done -# -# end of mkd - -With the device files and drivers in place, you can access PARIDE devices -like any other Linux device. For example, to mount a CD-ROM in pcd0, use: - - mount /dev/pcd0 /cdrom - -If you have a fresh Avatar Shark cartridge, and the drive is pda, you -might do something like: - - fdisk /dev/pda -- make a new partition table with - partition 1 of type 83 - - mke2fs /dev/pda1 -- to build the file system - - mkdir /shark -- make a place to mount the disk - - mount /dev/pda1 /shark - -Devices like the Imation superdisk work in the same way, except that -they do not have a partition table. For example to make a 120MB -floppy that you could share with a DOS system: - - mkdosfs /dev/pf0 - mount /dev/pf0 /mnt - - -2.4 The pf driver - -The pf driver is intended for use with parallel port ATAPI disk -devices. The most common devices in this category are PD drives -and LS-120 drives. Traditionally, media for these devices are not -partitioned. Consequently, the pf driver does not support partitioned -media. This may be changed in a future version of the driver. - -2.5 Using the pt driver - -The pt driver for parallel port ATAPI tape drives is a minimal driver. -It does not yet support many of the standard tape ioctl operations. -For best performance, a block size of 32KB should be used. You will -probably want to set the parallel port delay to 0, if you can. - -2.6 Using the pg driver - -The pg driver can be used in conjunction with the cdrecord program -to create CD-ROMs. Please get cdrecord version 1.6.1 or later -from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media -your parallel port should ideally be set to EPP mode, and the "port delay" -should be set to 0. With those settings it is possible to record at 2x -speed without any buffer underruns. If you cannot get the driver to work -in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. - - -3. Troubleshooting - -3.1 Use EPP mode if you can - -The most common problems that people report with the PARIDE drivers -concern the parallel port CMOS settings. At this time, none of the -PARIDE protocol modules support ECP mode, or any ECP combination modes. -If you are able to do so, please set your parallel port into EPP mode -using your CMOS setup procedure. - -3.2 Check the port delay - -Some parallel ports cannot reliably transfer data at full speed. To -offset the errors, the PARIDE protocol modules introduce a "port -delay" between each access to the i/o ports. Each protocol sets -a default value for this delay. In most cases, the user can override -the default and set it to 0 - resulting in somewhat higher transfer -rates. In some rare cases (especially with older 486 systems) the -default delays are not long enough. if you experience corrupt data -transfers, or unexpected failures, you may wish to increase the -port delay. The delay can be programmed using the "driveN" parameters -to each of the high-level drivers. Please see the notes above, or -read the comments at the beginning of the driver source files in -linux/drivers/block/paride. - -3.3 Some drives need a printer reset - -There appear to be a number of "noname" external drives on the market -that do not always power up correctly. We have noticed this with some -drives based on OnSpec and older Freecom adapters. In these rare cases, -the adapter can often be reinitialised by issuing a "printer reset" on -the parallel port. As the reset operation is potentially disruptive in -multiple device environments, the PARIDE drivers will not do it -automatically. You can however, force a printer reset by doing: - - insmod lp reset=1 - rmmod lp - -If you have one of these marginal cases, you should probably build -your paride drivers as modules, and arrange to do the printer reset -before loading the PARIDE drivers. - -3.4 Use the verbose option and dmesg if you need help - -While a lot of testing has gone into these drivers to make them work -as smoothly as possible, problems will arise. If you do have problems, -please check all the obvious things first: does the drive work in -DOS with the manufacturer's drivers ? If that doesn't yield any useful -clues, then please make sure that only one drive is hooked to your system, -and that either (a) PARPORT is enabled or (b) no other device driver -is using your parallel port (check in /proc/ioports). Then, load the -appropriate drivers (you can load several protocol modules if you want) -as in: - - # insmod paride - # insmod epat - # insmod bpck - # insmod kbic - ... - # insmod pd verbose=1 - -(using the correct driver for the type of device you have, of course). -The verbose=1 parameter will cause the drivers to log a trace of their -activity as they attempt to locate your drive. - -Use 'dmesg' to capture a log of all the PARIDE messages (any messages -beginning with paride:, a protocol module's name or a driver's name) and -include that with your bug report. You can submit a bug report in one -of two ways. Either send it directly to the author of the PARIDE suite, -by e-mail to grant@torque.net, or join the linux-parport mailing list -and post your report there. - -3.5 For more information or help - -You can join the linux-parport mailing list by sending a mail message -to - linux-parport-request@torque.net - -with the single word - - subscribe - -in the body of the mail message (not in the subject line). Please be -sure that your mail program is correctly set up when you do this, as -the list manager is a robot that will subscribe you using the reply -address in your mail headers. REMOVE any anti-spam gimmicks you may -have in your mail headers, when sending mail to the list server. - -You might also find some useful information on the linux-parport -web pages (although they are not always up to date) at - - http://web.archive.org/web/*/http://www.torque.net/parport/ - - diff --git a/Documentation/blockdev/ramdisk.rst b/Documentation/blockdev/ramdisk.rst new file mode 100644 index 000000000000..b7c2268f8dec --- /dev/null +++ b/Documentation/blockdev/ramdisk.rst @@ -0,0 +1,177 @@ +========================================== +Using the RAM disk block device with Linux +========================================== + +.. Contents: + + 1) Overview + 2) Kernel Command Line Parameters + 3) Using "rdev -r" + 4) An Example of Creating a Compressed RAM Disk + + +1) Overview +----------- + +The RAM disk driver is a way to use main system memory as a block device. It +is required for initrd, an initial filesystem used if you need to load modules +in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can +also be used for a temporary filesystem for crypto work, since the contents +are erased on reboot. + +The RAM disk dynamically grows as more space is required. It does this by using +RAM from the buffer cache. The driver marks the buffers it is using as dirty +so that the VM subsystem does not try to reclaim them later. + +The RAM disk supports up to 16 RAM disks by default, and can be reconfigured +to support an unlimited number of RAM disks (at your own risk). Just change +the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu +and (re)build the kernel. + +To use RAM disk support with your system, run './MAKEDEV ram' from the /dev +directory. RAM disks are all major number 1, and start with minor number 0 +for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. + +The new RAM disk also has the ability to load compressed RAM disk images, +allowing one to squeeze more programs onto an average installation or +rescue floppy disk. + + +2) Parameters +--------------------------------- + +2a) Kernel Command Line Parameters + + ramdisk_size=N + Size of the ramdisk. + +This parameter tells the RAM disk driver to set up RAM disks of N k size. The +default is 4096 (4 MB). + +2b) Module parameters + + rd_nr + /dev/ramX devices created. + + max_part + Maximum partition number. + + rd_size + See ramdisk_size. + +3) Using "rdev -r" +------------------ + +The usage of the word (two bytes) that "rdev -r" sets in the kernel image is +as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up +to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit +14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a +prompt/wait sequence is to be given before trying to read the RAM disk. Since +the RAM disk dynamically grows as data is being written into it, a size field +is not required. Bits 11 to 13 are not currently used and may as well be zero. +These numbers are no magical secrets, as seen below:: + + ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF + ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 + ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 + +Consider a typical two floppy disk setup, where you will have the +kernel on disk one, and have already put a RAM disk image onto disk #2. + +Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk +starts at an offset of 0 kB from the beginning of the floppy. +The command line equivalent is: "ramdisk_start=0" + +You want bit 14 as one, indicating that a RAM disk is to be loaded. +The command line equivalent is: "load_ramdisk=1" + +You want bit 15 as one, indicating that you want a prompt/keypress +sequence so that you have a chance to switch floppy disks. +The command line equivalent is: "prompt_ramdisk=1" + +Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. +So to create disk one of the set, you would do:: + + /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 + /usr/src/linux# rdev /dev/fd0 /dev/fd0 + /usr/src/linux# rdev -r /dev/fd0 49152 + +If you make a boot disk that has LILO, then for the above, you would use:: + + append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" + +Since the default start = 0 and the default prompt = 1, you could use:: + + append = "load_ramdisk=1" + + +4) An Example of Creating a Compressed RAM Disk +----------------------------------------------- + +To create a RAM disk image, you will need a spare block device to +construct it on. This can be the RAM disk device itself, or an +unused disk partition (such as an unmounted swap partition). For this +example, we will use the RAM disk device, "/dev/ram0". + +Note: This technique should not be done on a machine with less than 8 MB +of RAM. If using a spare disk partition instead of /dev/ram0, then this +restriction does not apply. + +a) Decide on the RAM disk size that you want. Say 2 MB for this example. + Create it by writing to the RAM disk device. (This step is not currently + required, but may be in the future.) It is wise to zero out the + area (esp. for disks) so that maximal compression is achieved for + the unused blocks of the image that you are about to create:: + + dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 + +b) Make a filesystem on it. Say ext2fs for this example:: + + mke2fs -vm0 /dev/ram0 2048 + +c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) + and unmount it again. + +d) Compress the contents of the RAM disk. The level of compression + will be approximately 50% of the space used by the files. Unused + space on the RAM disk will compress to almost nothing:: + + dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz + +e) Put the kernel onto the floppy:: + + dd if=zImage of=/dev/fd0 bs=1k + +f) Put the RAM disk image onto the floppy, after the kernel. Use an offset + that is slightly larger than the kernel, so that you can put another + (possibly larger) kernel onto the same floppy later without overlapping + the RAM disk image. An offset of 400 kB for kernels about 350 kB in + size would be reasonable. Make sure offset+size of ram_image.gz is + not larger than the total space on your floppy (usually 1440 kB):: + + dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 + +g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. + For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would + have 2^15 + 2^14 + 400 = 49552:: + + rdev /dev/fd0 /dev/fd0 + rdev -r /dev/fd0 49552 + +That is it. You now have your boot/root compressed RAM disk floppy. Some +users may wish to combine steps (d) and (f) by using a pipe. + + + Paul Gortmaker 12/95 + +Changelog: +---------- + +10-22-04 : + Updated to reflect changes in command line options, remove + obsolete references, general cleanup. + James Nelson (james4765@gmail.com) + + +12-95 : + Original Document diff --git a/Documentation/blockdev/ramdisk.txt b/Documentation/blockdev/ramdisk.txt deleted file mode 100644 index 501e12e0323e..000000000000 --- a/Documentation/blockdev/ramdisk.txt +++ /dev/null @@ -1,174 +0,0 @@ -Using the RAM disk block device with Linux ------------------------------------------- - -Contents: - - 1) Overview - 2) Kernel Command Line Parameters - 3) Using "rdev -r" - 4) An Example of Creating a Compressed RAM Disk - - -1) Overview ------------ - -The RAM disk driver is a way to use main system memory as a block device. It -is required for initrd, an initial filesystem used if you need to load modules -in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can -also be used for a temporary filesystem for crypto work, since the contents -are erased on reboot. - -The RAM disk dynamically grows as more space is required. It does this by using -RAM from the buffer cache. The driver marks the buffers it is using as dirty -so that the VM subsystem does not try to reclaim them later. - -The RAM disk supports up to 16 RAM disks by default, and can be reconfigured -to support an unlimited number of RAM disks (at your own risk). Just change -the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu -and (re)build the kernel. - -To use RAM disk support with your system, run './MAKEDEV ram' from the /dev -directory. RAM disks are all major number 1, and start with minor number 0 -for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. - -The new RAM disk also has the ability to load compressed RAM disk images, -allowing one to squeeze more programs onto an average installation or -rescue floppy disk. - - -2) Parameters ---------------------------------- - -2a) Kernel Command Line Parameters - - ramdisk_size=N - ============== - -This parameter tells the RAM disk driver to set up RAM disks of N k size. The -default is 4096 (4 MB). - -2b) Module parameters - - rd_nr - ===== - /dev/ramX devices created. - - max_part - ======== - Maximum partition number. - - rd_size - ======= - See ramdisk_size. - -3) Using "rdev -r" ------------------- - -The usage of the word (two bytes) that "rdev -r" sets in the kernel image is -as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up -to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit -14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a -prompt/wait sequence is to be given before trying to read the RAM disk. Since -the RAM disk dynamically grows as data is being written into it, a size field -is not required. Bits 11 to 13 are not currently used and may as well be zero. -These numbers are no magical secrets, as seen below: - -./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF -./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 -./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 - -Consider a typical two floppy disk setup, where you will have the -kernel on disk one, and have already put a RAM disk image onto disk #2. - -Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk -starts at an offset of 0 kB from the beginning of the floppy. -The command line equivalent is: "ramdisk_start=0" - -You want bit 14 as one, indicating that a RAM disk is to be loaded. -The command line equivalent is: "load_ramdisk=1" - -You want bit 15 as one, indicating that you want a prompt/keypress -sequence so that you have a chance to switch floppy disks. -The command line equivalent is: "prompt_ramdisk=1" - -Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. -So to create disk one of the set, you would do: - - /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 - /usr/src/linux# rdev /dev/fd0 /dev/fd0 - /usr/src/linux# rdev -r /dev/fd0 49152 - -If you make a boot disk that has LILO, then for the above, you would use: - append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" -Since the default start = 0 and the default prompt = 1, you could use: - append = "load_ramdisk=1" - - -4) An Example of Creating a Compressed RAM Disk ----------------------------------------------- - -To create a RAM disk image, you will need a spare block device to -construct it on. This can be the RAM disk device itself, or an -unused disk partition (such as an unmounted swap partition). For this -example, we will use the RAM disk device, "/dev/ram0". - -Note: This technique should not be done on a machine with less than 8 MB -of RAM. If using a spare disk partition instead of /dev/ram0, then this -restriction does not apply. - -a) Decide on the RAM disk size that you want. Say 2 MB for this example. - Create it by writing to the RAM disk device. (This step is not currently - required, but may be in the future.) It is wise to zero out the - area (esp. for disks) so that maximal compression is achieved for - the unused blocks of the image that you are about to create. - - dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 - -b) Make a filesystem on it. Say ext2fs for this example. - - mke2fs -vm0 /dev/ram0 2048 - -c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) - and unmount it again. - -d) Compress the contents of the RAM disk. The level of compression - will be approximately 50% of the space used by the files. Unused - space on the RAM disk will compress to almost nothing. - - dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz - -e) Put the kernel onto the floppy - - dd if=zImage of=/dev/fd0 bs=1k - -f) Put the RAM disk image onto the floppy, after the kernel. Use an offset - that is slightly larger than the kernel, so that you can put another - (possibly larger) kernel onto the same floppy later without overlapping - the RAM disk image. An offset of 400 kB for kernels about 350 kB in - size would be reasonable. Make sure offset+size of ram_image.gz is - not larger than the total space on your floppy (usually 1440 kB). - - dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 - -g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. - For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would - have 2^15 + 2^14 + 400 = 49552. - - rdev /dev/fd0 /dev/fd0 - rdev -r /dev/fd0 49552 - -That is it. You now have your boot/root compressed RAM disk floppy. Some -users may wish to combine steps (d) and (f) by using a pipe. - --------------------------------------------------------------------------- - Paul Gortmaker 12/95 - -Changelog: ----------- - -10-22-04 : Updated to reflect changes in command line options, remove - obsolete references, general cleanup. - James Nelson (james4765@gmail.com) - - -12-95 : Original Document diff --git a/Documentation/blockdev/zram.rst b/Documentation/blockdev/zram.rst new file mode 100644 index 000000000000..2111231c9c0f --- /dev/null +++ b/Documentation/blockdev/zram.rst @@ -0,0 +1,422 @@ +======================================== +zram: Compressed RAM based block devices +======================================== + +Introduction +============ + +The zram module creates RAM based block devices named /dev/zram +( = 0, 1, ...). Pages written to these disks are compressed and stored +in memory itself. These disks allow very fast I/O and compression provides +good amounts of memory savings. Some of the usecases include /tmp storage, +use as swap disks, various caches under /var and maybe many more :) + +Statistics for individual zram devices are exported through sysfs nodes at +/sys/block/zram/ + +Usage +===== + +There are several ways to configure and manage zram device(-s): + +a) using zram and zram_control sysfs attributes +b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). + +In this document we will describe only 'manual' zram configuration steps, +IOW, zram and zram_control sysfs attributes. + +In order to get a better idea about zramctl please consult util-linux +documentation, zramctl man-page or `zramctl --help`. Please be informed +that zram maintainers do not develop/maintain util-linux or zramctl, should +you have any questions please contact util-linux@vger.kernel.org + +Following shows a typical sequence of steps for using zram. + +WARNING +======= + +For the sake of simplicity we skip error checking parts in most of the +examples below. However, it is your sole responsibility to handle errors. + +zram sysfs attributes always return negative values in case of errors. +The list of possible return codes: + +======== ============================================================= +-EBUSY an attempt to modify an attribute that cannot be changed once + the device has been initialised. Please reset device first; +-ENOMEM zram was not able to allocate enough memory to fulfil your + needs; +-EINVAL invalid input has been provided. +======== ============================================================= + +If you use 'echo', the returned value that is changed by 'echo' utility, +and, in general case, something like:: + + echo 3 > /sys/block/zram0/max_comp_streams + if [ $? -ne 0 ]; + handle_error + fi + +should suffice. + +1) Load Module +============== + +:: + + modprobe zram num_devices=4 + This creates 4 devices: /dev/zram{0,1,2,3} + +num_devices parameter is optional and tells zram how many devices should be +pre-created. Default: 1. + +2) Set max number of compression streams +======================================== + +Regardless the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPUs - thus +allowing several concurrent compression operations. The number of +allocated compression streams goes down when some of the CPUs +become offline. There is no single-compression-stream mode anymore, +unless you are running a UP system or has only 1 CPU online. + +To find out how many streams are currently available:: + + cat /sys/block/zram0/max_comp_streams + +3) Select compression algorithm +=============================== + +Using comp_algorithm device attribute one can see available and +currently selected (shown in square brackets) compression algorithms, +change selected compression algorithm (once the device is initialised +there is no way to change compression algorithm). + +Examples:: + + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +For the time being, the `comp_algorithm` content does not necessarily +show every compression algorithm supported by the kernel. We keep this +list primarily to simplify device configuration and one can configure +a new device with a compression algorithm that is not listed in +`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API +and, if some of the algorithms were built as modules, it's impossible +to list all of them using, for instance, /proc/crypto or any other +method. This, however, has an advantage of permitting the usage of +custom crypto compression modules (implementing S/W or H/W compression). + +4) Set Disksize +=============== + +Set disk size by writing the value to sysfs node 'disksize'. +The value can be either in bytes or you can use mem suffixes. +Examples:: + + # Initialize /dev/zram0 with 50MB disksize + echo $((50*1024*1024)) > /sys/block/zram0/disksize + + # Using mem suffixes + echo 256K > /sys/block/zram0/disksize + echo 512M > /sys/block/zram0/disksize + echo 1G > /sys/block/zram0/disksize + +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + +5) Set memory limit: Optional +============================= + +Set memory limit by writing the value to sysfs node 'mem_limit'. +The value can be either in bytes or you can use mem suffixes. +In addition, you could change the value in runtime. +Examples:: + + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit + + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit + + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit + +6) Activate +=========== + +:: + + mkswap /dev/zram0 + swapon /dev/zram0 + + mkfs.ext4 /dev/zram1 + mount /dev/zram1 /tmp + +7) Add/remove zram devices +========================== + +zram provides a control interface, which enables dynamic (on-demand) device +addition and removal. + +In order to add a new /dev/zramX device, perform read operation on hot_add +attribute. This will return either new device's device id (meaning that you +can use /dev/zram) or error code. + +Example:: + + cat /sys/class/zram-control/hot_add + 1 + +To remove the existing /dev/zramX device (where X is a device id) +execute:: + + echo X > /sys/class/zram-control/hot_remove + +8) Stats +======== + +Per-device statistics are exported as various nodes under /sys/block/zram/ + +A brief description of exported device attributes. For more details please +read Documentation/ABI/testing/sysfs-block-zram. + +====================== ====== =============================================== +Name access description +====================== ====== =============================================== +disksize RW show and set the device's disk size +initstate RO shows the initialization state of the device +reset WO trigger device reset +mem_used_max WO reset the `mem_used_max` counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can + use to store the compressed data +writeback_limit WO specifies the maximum amount of write IO zram + can write out to backing device as 4KB unit +writeback_limit_enable RW show and set writeback_limit feature +max_comp_streams RW the number of possible concurrent compress + operations +comp_algorithm RW show and change the compression algorithm +compact WO trigger memory compaction +debug_stat RO this file is used for zram debugging purposes +backing_dev RW set up backend storage for zram to write out +idle WO mark allocated slot as idle +====================== ====== =============================================== + + +User space is advised to use the following files to read the device statistics. + +File /sys/block/zram/stat + +Represents block layer statistics. Read Documentation/block/stat.txt for +details. + +File /sys/block/zram/io_stat + +The stat file represents device's I/O statistics not accounted by block +layer and, thus, not available in zram/stat file. It consists of a +single line of text and contains the following stats separated by +whitespace: + + ============= ============================================================= + failed_reads The number of failed reads + failed_writes The number of failed writes + invalid_io The number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + + a) the number of pages freed because of swap slot free + notifications + b) the number of pages freed because of + REQ_OP_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. + ============= ============================================================= + +File /sys/block/zram/mm_stat + +The stat file represents device's mm statistics. It consists of a single +line of text and contains the following stats separated by whitespace: + + ================ ============================================================= + orig_data_size uncompressed size of data stored in this disk. + This excludes same-element-filled pages (same_pages) since + no memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + same_pages the number of same element filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages + ================ ============================================================= + +File /sys/block/zram/bd_stat + +The stat file represents device's backing device statistics. It consists of +a single line of text and contains the following stats separated by whitespace: + + ============== ============================================================= + bd_count size of data written in backing device. + Unit: 4K bytes + bd_reads the number of reads from backing device + Unit: 4K bytes + bd_writes the number of writes to backing device + Unit: 4K bytes + ============== ============================================================= + +9) Deactivate +============= + +:: + + swapoff /dev/zram0 + umount /dev/zram1 + +10) Reset +========= + + Write any positive value to 'reset' sysfs node:: + + echo 1 > /sys/block/zram0/reset + echo 1 > /sys/block/zram1/reset + + This frees all the memory allocated for the given device and + resets the disksize to zero. You must set the disksize again + before reusing the device. + +Optional Feature +================ + +writeback +--------- + +With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page +to backing storage rather than keeping it in memory. +To use the feature, admin should set up backing device via:: + + echo /dev/sda5 > /sys/block/zramX/backing_dev + +before disksize setting. It supports only partition at this moment. +If admin want to use incompressible page writeback, they could do via:: + + echo huge > /sys/block/zramX/write + +To use idle page writeback, first, user need to declare zram pages +as idle:: + + echo all > /sys/block/zramX/idle + +From now on, any pages on zram are idle pages. The idle mark +will be removed until someone request access of the block. +IOW, unless there is access request, those pages are still idle pages. + +Admin can request writeback of those idle pages at right timing via:: + + echo idle > /sys/block/zramX/writeback + +With the command, zram writeback idle pages from memory to the storage. + +If there are lots of write IO with flash device, potentially, it has +flash wearout problem so that admin needs to design write limitation +to guarantee storage health for entire product life. + +To overcome the concern, zram supports "writeback_limit" feature. +The "writeback_limit_enable"'s default value is 0 so that it doesn't limit +any writeback. IOW, if admin want to apply writeback budget, he should +enable writeback_limit_enable via:: + + $ echo 1 > /sys/block/zramX/writeback_limit_enable + +Once writeback_limit_enable is set, zram doesn't allow any writeback +until admin set the budget via /sys/block/zramX/writeback_limit. + +(If admin doesn't enable writeback_limit_enable, writeback_limit's value +assigned via /sys/block/zramX/writeback_limit is meaninless.) + +If admin want to limit writeback as per-day 400M, he could do it +like below:: + + $ MB_SHIFT=20 + $ 4K_SHIFT=12 + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit. + $ echo 1 > /sys/block/zram0/writeback_limit_enable + +If admin want to allow further write again once the bugdet is exausted, +he could do it like below:: + + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit + +If admin want to see remaining writeback budget since he set:: + + $ cat /sys/block/zramX/writeback_limit + +If admin want to disable writeback limit, he could do:: + + $ echo 0 > /sys/block/zramX/writeback_limit_enable + +The writeback_limit count will reset whenever you reset zram(e.g., +system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of +writeback happened until you reset the zram to allocate extra writeback +budget in next setting is user's job. + +If admin want to measure writeback count in a certain period, he could +know it via /sys/block/zram0/bd_stat's 3rd column. + +memory tracking +=============== + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. + +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: + + 300 75.033841 .wh. + 301 63.806904 s... + 302 63.806919 ..hi + +First column + zram's block index. +Second column + access time since the system was booted +Third column + state of the block: + + s: + same page + w: + written page to backing store + h: + huge page + i: + idle page + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + +Nitin Gupta +ngupta@vflare.org diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt deleted file mode 100644 index 4df0ce271085..000000000000 --- a/Documentation/blockdev/zram.txt +++ /dev/null @@ -1,355 +0,0 @@ -zram: Compressed RAM based block devices ----------------------------------------- - -* Introduction - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -* Usage - -There are several ways to configure and manage zram device(-s): -a) using zram and zram_control sysfs attributes -b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). - -In this document we will describe only 'manual' zram configuration steps, -IOW, zram and zram_control sysfs attributes. - -In order to get a better idea about zramctl please consult util-linux -documentation, zramctl man-page or `zramctl --help'. Please be informed -that zram maintainers do not develop/maintain util-linux or zramctl, should -you have any questions please contact util-linux@vger.kernel.org - -Following shows a typical sequence of steps for using zram. - -WARNING -======= -For the sake of simplicity we skip error checking parts in most of the -examples below. However, it is your sole responsibility to handle errors. - -zram sysfs attributes always return negative values in case of errors. -The list of possible return codes: --EBUSY -- an attempt to modify an attribute that cannot be changed once -the device has been initialised. Please reset device first; --ENOMEM -- zram was not able to allocate enough memory to fulfil your -needs; --EINVAL -- invalid input has been provided. - -If you use 'echo', the returned value that is changed by 'echo' utility, -and, in general case, something like: - - echo 3 > /sys/block/zram0/max_comp_streams - if [ $? -ne 0 ]; - handle_error - fi - -should suffice. - -1) Load Module: - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - -num_devices parameter is optional and tells zram how many devices should be -pre-created. Default: 1. - -2) Set max number of compression streams -Regardless the value passed to this attribute, ZRAM will always -allocate multiple compression streams - one per online CPUs - thus -allowing several concurrent compression operations. The number of -allocated compression streams goes down when some of the CPUs -become offline. There is no single-compression-stream mode anymore, -unless you are running a UP system or has only 1 CPU online. - -To find out how many streams are currently available: - cat /sys/block/zram0/max_comp_streams - -3) Select compression algorithm -Using comp_algorithm device attribute one can see available and -currently selected (shown in square brackets) compression algorithms, -change selected compression algorithm (once the device is initialised -there is no way to change compression algorithm). - -Examples: - #show supported compression algorithms - cat /sys/block/zram0/comp_algorithm - lzo [lz4] - - #select lzo compression algorithm - echo lzo > /sys/block/zram0/comp_algorithm - -For the time being, the `comp_algorithm' content does not necessarily -show every compression algorithm supported by the kernel. We keep this -list primarily to simplify device configuration and one can configure -a new device with a compression algorithm that is not listed in -`comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API -and, if some of the algorithms were built as modules, it's impossible -to list all of them using, for instance, /proc/crypto or any other -method. This, however, has an advantage of permitting the usage of -custom crypto compression modules (implementing S/W or H/W compression). - -4) Set Disksize -Set disk size by writing the value to sysfs node 'disksize'. -The value can be either in bytes or you can use mem suffixes. -Examples: - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -Note: -There is little point creating a zram of greater than twice the size of memory -since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the -size of the disk when not in use so a huge zram is wasteful. - -5) Set memory limit: Optional -Set memory limit by writing the value to sysfs node 'mem_limit'. -The value can be either in bytes or you can use mem suffixes. -In addition, you could change the value in runtime. -Examples: - # limit /dev/zram0 with 50MB memory - echo $((50*1024*1024)) > /sys/block/zram0/mem_limit - - # Using mem suffixes - echo 256K > /sys/block/zram0/mem_limit - echo 512M > /sys/block/zram0/mem_limit - echo 1G > /sys/block/zram0/mem_limit - - # To disable memory limit - echo 0 > /sys/block/zram0/mem_limit - -6) Activate: - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -7) Add/remove zram devices - -zram provides a control interface, which enables dynamic (on-demand) device -addition and removal. - -In order to add a new /dev/zramX device, perform read operation on hot_add -attribute. This will return either new device's device id (meaning that you -can use /dev/zram) or error code. - -Example: - cat /sys/class/zram-control/hot_add - 1 - -To remove the existing /dev/zramX device (where X is a device id) -execute - echo X > /sys/class/zram-control/hot_remove - -8) Stats: -Per-device statistics are exported as various nodes under /sys/block/zram/ - -A brief description of exported device attributes. For more details please -read Documentation/ABI/testing/sysfs-block-zram. - -Name access description ----- ------ ----------- -disksize RW show and set the device's disk size -initstate RO shows the initialization state of the device -reset WO trigger device reset -mem_used_max WO reset the `mem_used_max' counter (see later) -mem_limit WO specifies the maximum amount of memory ZRAM can use - to store the compressed data -writeback_limit WO specifies the maximum amount of write IO zram can - write out to backing device as 4KB unit -writeback_limit_enable RW show and set writeback_limit feature -max_comp_streams RW the number of possible concurrent compress operations -comp_algorithm RW show and change the compression algorithm -compact WO trigger memory compaction -debug_stat RO this file is used for zram debugging purposes -backing_dev RW set up backend storage for zram to write out -idle WO mark allocated slot as idle - - -User space is advised to use the following files to read the device statistics. - -File /sys/block/zram/stat - -Represents block layer statistics. Read Documentation/block/stat.txt for -details. - -File /sys/block/zram/io_stat - -The stat file represents device's I/O statistics not accounted by block -layer and, thus, not available in zram/stat file. It consists of a -single line of text and contains the following stats separated by -whitespace: - failed_reads the number of failed reads - failed_writes the number of failed writes - invalid_io the number of non-page-size-aligned I/O requests - notify_free Depending on device usage scenario it may account - a) the number of pages freed because of swap slot free - notifications or b) the number of pages freed because of - REQ_OP_DISCARD requests sent by bio. The former ones are - sent to a swap block device when a swap slot is freed, - which implies that this disk is being used as a swap disk. - The latter ones are sent by filesystem mounted with - discard option, whenever some data blocks are getting - discarded. - -File /sys/block/zram/mm_stat - -The stat file represents device's mm statistics. It consists of a single -line of text and contains the following stats separated by whitespace: - orig_data_size uncompressed size of data stored in this disk. - This excludes same-element-filled pages (same_pages) since - no memory is allocated for them. - Unit: bytes - compr_data_size compressed size of data stored in this disk - mem_used_total the amount of memory allocated for this disk. This - includes allocator fragmentation and metadata overhead, - allocated for this disk. So, allocator space efficiency - can be calculated using compr_data_size and this statistic. - Unit: bytes - mem_limit the maximum amount of memory ZRAM can use to store - the compressed data - mem_used_max the maximum amount of memory zram have consumed to - store the data - same_pages the number of same element filled pages written to this disk. - No memory is allocated for such pages. - pages_compacted the number of pages freed during compaction - huge_pages the number of incompressible pages - -File /sys/block/zram/bd_stat - -The stat file represents device's backing device statistics. It consists of -a single line of text and contains the following stats separated by whitespace: - bd_count size of data written in backing device. - Unit: 4K bytes - bd_reads the number of reads from backing device - Unit: 4K bytes - bd_writes the number of writes to backing device - Unit: 4K bytes - -9) Deactivate: - swapoff /dev/zram0 - umount /dev/zram1 - -10) Reset: - Write any positive value to 'reset' sysfs node - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -* Optional Feature - -= writeback - -With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page -to backing storage rather than keeping it in memory. -To use the feature, admin should set up backing device via - - "echo /dev/sda5 > /sys/block/zramX/backing_dev" - -before disksize setting. It supports only partition at this moment. -If admin want to use incompressible page writeback, they could do via - - "echo huge > /sys/block/zramX/write" - -To use idle page writeback, first, user need to declare zram pages -as idle. - - "echo all > /sys/block/zramX/idle" - -From now on, any pages on zram are idle pages. The idle mark -will be removed until someone request access of the block. -IOW, unless there is access request, those pages are still idle pages. - -Admin can request writeback of those idle pages at right timing via - - "echo idle > /sys/block/zramX/writeback" - -With the command, zram writeback idle pages from memory to the storage. - -If there are lots of write IO with flash device, potentially, it has -flash wearout problem so that admin needs to design write limitation -to guarantee storage health for entire product life. - -To overcome the concern, zram supports "writeback_limit" feature. -The "writeback_limit_enable"'s default value is 0 so that it doesn't limit -any writeback. IOW, if admin want to apply writeback budget, he should -enable writeback_limit_enable via - - $ echo 1 > /sys/block/zramX/writeback_limit_enable - -Once writeback_limit_enable is set, zram doesn't allow any writeback -until admin set the budget via /sys/block/zramX/writeback_limit. - -(If admin doesn't enable writeback_limit_enable, writeback_limit's value -assigned via /sys/block/zramX/writeback_limit is meaninless.) - -If admin want to limit writeback as per-day 400M, he could do it -like below. - - $ MB_SHIFT=20 - $ 4K_SHIFT=12 - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit. - $ echo 1 > /sys/block/zram0/writeback_limit_enable - -If admin want to allow further write again once the bugdet is exausted, -he could do it like below - - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit - -If admin want to see remaining writeback budget since he set, - - $ cat /sys/block/zramX/writeback_limit - -If admin want to disable writeback limit, he could do - - $ echo 0 > /sys/block/zramX/writeback_limit_enable - -The writeback_limit count will reset whenever you reset zram(e.g., -system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of -writeback happened until you reset the zram to allocate extra writeback -budget in next setting is user's job. - -If admin want to measure writeback count in a certain period, he could -know it via /sys/block/zram0/bd_stat's 3rd column. - -= memory tracking - -With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the -zram block. It could be useful to catch cold or incompressible -pages of the process with*pagemap. -If you enable the feature, you could see block state via -/sys/kernel/debug/zram/zram0/block_state". The output is as follows, - - 300 75.033841 .wh. - 301 63.806904 s... - 302 63.806919 ..hi - -First column is zram's block index. -Second column is access time since the system was booted -Third column is state of the block. -(s: same page -w: written page to backing store -h: huge page -i: idle page) - -First line of above example says 300th block is accessed at 75.033841sec -and the block's state is huge so it is written back to the backing -storage. It's a debugging feature so anyone shouldn't rely on it to work -properly. - -Nitin Gupta -ngupta@vflare.org diff --git a/MAINTAINERS b/MAINTAINERS index 3ee73751f56c..ec541c8dc645 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11076,7 +11076,7 @@ M: Josef Bacik S: Maintained L: linux-block@vger.kernel.org L: nbd@other.debian.org -F: Documentation/blockdev/nbd.txt +F: Documentation/blockdev/nbd.rst F: drivers/block/nbd.c F: include/trace/events/nbd.h F: include/uapi/linux/nbd.h @@ -12086,7 +12086,7 @@ PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES M: Tim Waugh L: linux-parport@lists.infradead.org (subscribers-only) S: Maintained -F: Documentation/blockdev/paride.txt +F: Documentation/blockdev/paride.rst F: drivers/block/paride/ PARISC ARCHITECTURE @@ -13367,7 +13367,7 @@ F: drivers/net/wireless/ralink/rt2x00/ RAMDISK RAM BLOCK DEVICE DRIVER M: Jens Axboe S: Maintained -F: Documentation/blockdev/ramdisk.txt +F: Documentation/blockdev/ramdisk.rst F: drivers/block/brd.c RANCHU VIRTUAL BOARD FOR MIPS @@ -17723,7 +17723,7 @@ R: Sergey Senozhatsky L: linux-kernel@vger.kernel.org S: Maintained F: drivers/block/zram/ -F: Documentation/blockdev/zram.txt +F: Documentation/blockdev/zram.rst ZS DECSTATION Z85C30 SERIAL DRIVER M: "Maciej W. Rozycki" diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 96ec7e0fc1ea..c43690b973d8 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -31,7 +31,7 @@ config BLK_DEV_FD If you want to use the floppy disk drive(s) of your PC under Linux, say Y. Information about this driver, especially important for IBM Thinkpad users, is contained in - . + . That file also contains the location of the Floppy driver FAQ as well as location of the fdutils package used to configure additional parameters of the driver at run time. @@ -96,7 +96,7 @@ config PARIDE your computer's parallel port. Most of them are actually IDE devices using a parallel port IDE adapter. This option enables the PARIDE subsystem which contains drivers for many of these external drives. - Read for more information. + Read for more information. If you have said Y to the "Parallel-port support" configuration option, you may share a single port between your printer and other @@ -261,7 +261,7 @@ config BLK_DEV_NBD userland (making server and client physically the same computer, communicating using the loopback network device). - Read for more information, + Read for more information, especially about where to find the server code, which runs in user space and does not need special kernel support. @@ -303,7 +303,7 @@ config BLK_DEV_RAM during the initial install of Linux. Note that the kernel command line option "ramdisk=XX" is now obsolete. - For details, read . + For details, read . To compile this driver as a module, choose M here: the module will be called brd. An alias "rd" has been defined diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index b933a7eea52b..5c99e52f9dc1 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4424,7 +4424,7 @@ static int __init floppy_setup(char *str) pr_cont("\n"); } else DPRINT("botched floppy option\n"); - DPRINT("Read Documentation/blockdev/floppy.txt\n"); + DPRINT("Read Documentation/blockdev/floppy.rst\n"); return 0; } diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 1ffc64770643..e06b99d54816 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -12,7 +12,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See Documentation/blockdev/zram.txt for more information. + See Documentation/blockdev/zram.rst for more information. config ZRAM_WRITEBACK bool "Write back incompressible or idle page to backing device" @@ -26,7 +26,7 @@ config ZRAM_WRITEBACK With /sys/block/zramX/{idle,writeback}, application could ask idle page's writeback to the backing device to save in memory. - See Documentation/blockdev/zram.txt for more information. + See Documentation/blockdev/zram.rst for more information. config ZRAM_MEMORY_TRACKING bool "Track zRam block status" @@ -36,4 +36,4 @@ config ZRAM_MEMORY_TRACKING of zRAM. Admin could see the information via /sys/kernel/debug/zram/zramX/block_state. - See Documentation/blockdev/zram.txt for more information. + See Documentation/blockdev/zram.rst for more information. diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README index 7972cc512408..5fa378391d3b 100644 --- a/tools/testing/selftests/zram/README +++ b/tools/testing/selftests/zram/README @@ -37,4 +37,4 @@ Commands required for testing: - mkfs/ mkfs.ext4 For more information please refer: -kernel-source-tree/Documentation/blockdev/zram.txt +kernel-source-tree/Documentation/blockdev/zram.rst -- cgit v1.2.3 From e7751617dd0599ceadf4221cb08e04307b00aa1f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 11:47:10 -0300 Subject: docs: blockdev: add it to the admin-guide The blockdev book basically contains user-faced documentation. Signed-off-by: Mauro Carvalho Chehab --- .../blockdev/drbd/DRBD-8.3-data-packets.svg | 588 +++++++++++++++++++++ .../blockdev/drbd/DRBD-data-packets.svg | 459 ++++++++++++++++ .../admin-guide/blockdev/drbd/conn-states-8.dot | 18 + .../blockdev/drbd/data-structure-v9.rst | 42 ++ .../admin-guide/blockdev/drbd/disk-states-8.dot | 16 + .../drbd/drbd-connection-state-overview.dot | 85 +++ .../admin-guide/blockdev/drbd/figures.rst | 28 + Documentation/admin-guide/blockdev/drbd/index.rst | 19 + .../admin-guide/blockdev/drbd/node-states-8.dot | 13 + Documentation/admin-guide/blockdev/floppy.rst | 255 +++++++++ Documentation/admin-guide/blockdev/index.rst | 14 + Documentation/admin-guide/blockdev/nbd.rst | 31 ++ Documentation/admin-guide/blockdev/paride.rst | 439 +++++++++++++++ Documentation/admin-guide/blockdev/ramdisk.rst | 177 +++++++ Documentation/admin-guide/blockdev/zram.rst | 422 +++++++++++++++ Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 18 +- .../blockdev/drbd/DRBD-8.3-data-packets.svg | 588 --------------------- Documentation/blockdev/drbd/DRBD-data-packets.svg | 459 ---------------- Documentation/blockdev/drbd/conn-states-8.dot | 18 - Documentation/blockdev/drbd/data-structure-v9.rst | 42 -- Documentation/blockdev/drbd/disk-states-8.dot | 16 - .../drbd/drbd-connection-state-overview.dot | 85 --- Documentation/blockdev/drbd/figures.rst | 28 - Documentation/blockdev/drbd/index.rst | 19 - Documentation/blockdev/drbd/node-states-8.dot | 14 - Documentation/blockdev/floppy.rst | 255 --------- Documentation/blockdev/index.rst | 16 - Documentation/blockdev/nbd.rst | 31 -- Documentation/blockdev/paride.rst | 439 --------------- Documentation/blockdev/ramdisk.rst | 177 ------- Documentation/blockdev/zram.rst | 422 --------------- MAINTAINERS | 10 +- drivers/block/Kconfig | 8 +- drivers/block/floppy.c | 2 +- drivers/block/zram/Kconfig | 6 +- tools/testing/selftests/zram/README | 2 +- 37 files changed, 2630 insertions(+), 2632 deletions(-) create mode 100644 Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg create mode 100644 Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg create mode 100644 Documentation/admin-guide/blockdev/drbd/conn-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/disk-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/figures.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/index.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/node-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/floppy.rst create mode 100644 Documentation/admin-guide/blockdev/index.rst create mode 100644 Documentation/admin-guide/blockdev/nbd.rst create mode 100644 Documentation/admin-guide/blockdev/paride.rst create mode 100644 Documentation/admin-guide/blockdev/ramdisk.rst create mode 100644 Documentation/admin-guide/blockdev/zram.rst delete mode 100644 Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg delete mode 100644 Documentation/blockdev/drbd/DRBD-data-packets.svg delete mode 100644 Documentation/blockdev/drbd/conn-states-8.dot delete mode 100644 Documentation/blockdev/drbd/data-structure-v9.rst delete mode 100644 Documentation/blockdev/drbd/disk-states-8.dot delete mode 100644 Documentation/blockdev/drbd/drbd-connection-state-overview.dot delete mode 100644 Documentation/blockdev/drbd/figures.rst delete mode 100644 Documentation/blockdev/drbd/index.rst delete mode 100644 Documentation/blockdev/drbd/node-states-8.dot delete mode 100644 Documentation/blockdev/floppy.rst delete mode 100644 Documentation/blockdev/index.rst delete mode 100644 Documentation/blockdev/nbd.rst delete mode 100644 Documentation/blockdev/paride.rst delete mode 100644 Documentation/blockdev/ramdisk.rst delete mode 100644 Documentation/blockdev/zram.rst (limited to 'tools') diff --git a/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg new file mode 100644 index 000000000000..f87cfa0dc2fb --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg @@ -0,0 +1,588 @@ + + + + + + Master slide + + + + + + + + + + RSDataReply + + + + + + + CsumRSRequest + + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + + WriteAck + + + + got_BlockAck() + + + Checksum based Resync, case not in sync + + + DRBD-8.3 data flow + + + w_e_send_csum() + + + + + + + + RSIsInSync + + + + + + + CsumRSRequest + + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + got_IsInSync() + + + Checksum based Resync, case in sync + + + + + + + + + + OVReply + + + + + + + OVRequest + + + + receive_OVRequest() + + + drbd_endio_read_sec() + + + w_e_end_ov_req() + + + receive_OVReply() + + + drbd_endio_read_sec() + + + w_e_end_ov_reply() + + + + + + OVResult + + + + got_OVResult() + + + Online verify + + + w_make_ov_request() + + + + + + + + drbd_endio_read_sec() + + + w_make_resync_request() + + + w_e_send_csum() + + + + + drbd_endio_read_sec() + + + + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + diff --git a/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg new file mode 100644 index 000000000000..48a1e2165fec --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg @@ -0,0 +1,459 @@ + + + + + + Master slide + + + + + + + + + RSDataReply + + + + + RSDataRequest + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_rsdata_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + WriteAck + + + got_BlockAck() + + + Resync blocks, 4-32K + + + + + + + WriteAck + + + + + Data + + + drbd_make_request() + + + receive_Data() + + + drbd_endio_write_sec() + + + e_end_block() + + + got_BlockAck() + + + Regular mirrored write, 512-32K + + + w_send_dblock() + + + + + drbd_endio_write_pri() + + + + + + + DataReply + + + + + DataRequest + + + drbd_make_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_data_req() + + + Drawing + + receive_DataReply() + + + + Diskless read, 512-32K + + + w_send_read_req() + + + DRBD 8 data flow + + + + + + al_begin_io() + + + al_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + diff --git a/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot new file mode 100644 index 000000000000..025e8cf5e64a --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot @@ -0,0 +1,18 @@ +digraph conn_states { + StandAllone -> WFConnection [ label = "ioctl_set_net()" ] + WFConnection -> Unconnected [ label = "unable to bind()" ] + WFConnection -> WFReportParams [ label = "in connect() after accept" ] + WFReportParams -> StandAllone [ label = "checks in receive_param()" ] + WFReportParams -> Connected [ label = "in receive_param()" ] + WFReportParams -> WFBitMapS [ label = "sync_handshake()" ] + WFReportParams -> WFBitMapT [ label = "sync_handshake()" ] + WFBitMapS -> SyncSource [ label = "receive_bitmap()" ] + WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ] + SyncSource -> Connected + SyncTarget -> Connected + SyncSource -> PausedSyncS + SyncTarget -> PausedSyncT + PausedSyncS -> SyncSource + PausedSyncT -> SyncTarget + Connected -> WFConnection [ label = "* on network error" ] +} diff --git a/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst new file mode 100644 index 000000000000..66036b901644 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst @@ -0,0 +1,42 @@ +================================ +kernel data structure for DRBD-9 +================================ + +This describes the in kernel data structure for DRBD-9. Starting with +Linux v3.14 we are reorganizing DRBD to use this data structure. + +Basic Data Structure +==================== + +A node has a number of DRBD resources. Each such resource has a number of +devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD +device is represented by a block device locally. + +The DRBD objects are interconnected to form a matrix as depicted below; a +drbd_peer_device object sits at each intersection between a drbd_device and a +drbd_connection:: + + /--------------+---------------+.....+---------------\ + | resource | device | | device | + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + +--------------+---------------+.....+---------------+ + : : : : : + : : : : : + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + \--------------+---------------+.....+---------------/ + +In this table, horizontally, devices can be accessed from resources by their +volume number. Likewise, peer_devices can be accessed from connections by +their volume number. Objects in the vertical direction are connected by double +linked lists. There are back pointers from peer_devices to their connections a +devices, and from connections and devices to their resource. + +All resources are in the drbd_resources double-linked list. In addition, all +devices can be accessed by their minor device number via the drbd_devices idr. + +The drbd_resource, drbd_connection, and drbd_device objects are reference +counted. The peer_device objects only serve to establish the links between +devices and connections; their lifetime is determined by the lifetime of the +device and connection which they reference. diff --git a/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot new file mode 100644 index 000000000000..d06cfb46fb98 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot @@ -0,0 +1,16 @@ +digraph disk_states { + Diskless -> Inconsistent [ label = "ioctl_set_disk()" ] + Diskless -> Consistent [ label = "ioctl_set_disk()" ] + Diskless -> Outdated [ label = "ioctl_set_disk()" ] + Consistent -> Outdated [ label = "receive_param()" ] + Consistent -> UpToDate [ label = "receive_param()" ] + Consistent -> Inconsistent [ label = "start resync" ] + Outdated -> Inconsistent [ label = "start resync" ] + UpToDate -> Inconsistent [ label = "ioctl_replicate" ] + Inconsistent -> UpToDate [ label = "resync completed" ] + Consistent -> Failed [ label = "io completion error" ] + Outdated -> Failed [ label = "io completion error" ] + UpToDate -> Failed [ label = "io completion error" ] + Inconsistent -> Failed [ label = "io completion error" ] + Failed -> Diskless [ label = "sending notify to peer" ] +} diff --git a/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot new file mode 100644 index 000000000000..6d9cf0a7b11d --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot @@ -0,0 +1,85 @@ +// vim: set sw=2 sts=2 : +digraph { + rankdir=BT + bgcolor=white + + node [shape=plaintext] + node [fontcolor=black] + + StandAlone [ style=filled,fillcolor=gray,label=StandAlone ] + + node [fontcolor=lightgray] + + Unconnected [ label=Unconnected ] + + CommTrouble [ shape=record, + label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ] + + node [fontcolor=gray] + + subgraph cluster_try_connect { + label="try to connect, handshake" + rank=max + WFConnection [ label=WFConnection ] + WFReportParams [ label=WFReportParams ] + } + + TearDown [ label=TearDown ] + + Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ] + + node [fontcolor=lightblue] + + StartingSyncS [ label=StartingSyncS ] + StartingSyncT [ label=StartingSyncT ] + + subgraph cluster_bitmap_exchange { + node [fontcolor=red] + fontcolor=red + label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged" + + WFBitMapT [ label=WFBitMapT ] + WFSyncUUID [ label=WFSyncUUID ] + WFBitMapS [ label=WFBitMapS ] + } + + node [fontcolor=blue] + + cluster_resync [ shape=record,label="{resynchronisation process running\l'concurrent' application requests allowed|{{PausedSyncT\nSyncTarget}|{PausedSyncS\nSyncSource}}}" ] + + node [shape=box,fontcolor=black] + + // drbdadm [label="drbdadm connect"] + // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."] + // comm_error [label="communication trouble"] + + // + // edges + // -------------------------------------- + + StandAlone -> Unconnected [ label="drbdadm connect" ] + Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ] + Unconnected -> WFConnection [ label="receiver thread is started" ] + WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ] + + WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ] + WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ] + + WFReportParams -> WFBitMapS + WFReportParams -> WFBitMapT + WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false] + + WFBitMapS -> cluster_resync:S + WFSyncUUID -> cluster_resync:T + + edge [color=green] + cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ] + + edge [color=red] + WFReportParams -> CommTrouble + Connected -> CommTrouble + cluster_resync:any -> CommTrouble + edge [color=black] + CommTrouble -> Unconnected [label="receiver thread is stopped" ] + +} diff --git a/Documentation/admin-guide/blockdev/drbd/figures.rst b/Documentation/admin-guide/blockdev/drbd/figures.rst new file mode 100644 index 000000000000..3e3fd4b8a478 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/figures.rst @@ -0,0 +1,28 @@ +.. The here included files are intended to help understand the implementation + +Data flows that Relate some functions, and write packets +======================================================== + +.. kernel-figure:: DRBD-8.3-data-packets.svg + :alt: DRBD-8.3-data-packets.svg + :align: center + +.. kernel-figure:: DRBD-data-packets.svg + :alt: DRBD-data-packets.svg + :align: center + + +Sub graphs of DRBD's state transitions +====================================== + +.. kernel-figure:: conn-states-8.dot + :alt: conn-states-8.dot + :align: center + +.. kernel-figure:: disk-states-8.dot + :alt: disk-states-8.dot + :align: center + +.. kernel-figure:: node-states-8.dot + :alt: node-states-8.dot + :align: center diff --git a/Documentation/admin-guide/blockdev/drbd/index.rst b/Documentation/admin-guide/blockdev/drbd/index.rst new file mode 100644 index 000000000000..68ecd5c113e9 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/index.rst @@ -0,0 +1,19 @@ +========================================== +Distributed Replicated Block Device - DRBD +========================================== + +Description +=========== + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Please visit http://www.drbd.org to find out more. + +.. toctree:: + :maxdepth: 1 + + data-structure-v9 + figures diff --git a/Documentation/admin-guide/blockdev/drbd/node-states-8.dot b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot new file mode 100644 index 000000000000..bfa54e1f8016 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot @@ -0,0 +1,13 @@ +digraph node_states { + Secondary -> Primary [ label = "ioctl_set_state()" ] + Primary -> Secondary [ label = "ioctl_set_state()" ] +} + +digraph peer_states { + Secondary -> Primary [ label = "recv state packet" ] + Primary -> Secondary [ label = "recv state packet" ] + Primary -> Unknown [ label = "connection lost" ] + Secondary -> Unknown [ label = "connection lost" ] + Unknown -> Primary [ label = "connected" ] + Unknown -> Secondary [ label = "connected" ] +} diff --git a/Documentation/admin-guide/blockdev/floppy.rst b/Documentation/admin-guide/blockdev/floppy.rst new file mode 100644 index 000000000000..4a8f31cf4139 --- /dev/null +++ b/Documentation/admin-guide/blockdev/floppy.rst @@ -0,0 +1,255 @@ +============= +Floppy Driver +============= + +FAQ list: +========= + +A FAQ list may be found in the fdutils package (see below), and also +at . + + +LILO configuration options (Thinkpad users, read this) +====================================================== + +The floppy driver is configured using the 'floppy=' option in +lilo. This option can be typed at the boot prompt, or entered in the +lilo configuration file. + +Example: If your kernel is called linux-2.6.9, type the following line +at the lilo boot prompt (if you have a thinkpad):: + + linux-2.6.9 floppy=thinkpad + +You may also enter the following line in /etc/lilo.conf, in the description +of linux-2.6.9:: + + append = "floppy=thinkpad" + +Several floppy related options may be given, example:: + + linux-2.6.9 floppy=daring floppy=two_fdc + append = "floppy=daring floppy=two_fdc" + +If you give options both in the lilo config file and on the boot +prompt, the option strings of both places are concatenated, the boot +prompt options coming last. That's why there are also options to +restore the default behavior. + + +Module configuration options +============================ + +If you use the floppy driver as a module, use the following syntax:: + + modprobe floppy floppy="" + +Example:: + + modprobe floppy floppy="omnibook messages" + +If you need certain options enabled every time you load the floppy driver, +you can put:: + + options floppy floppy="omnibook messages" + +in a configuration file in /etc/modprobe.d/. + + +The floppy driver related options are: + + floppy=asus_pci + Sets the bit mask to allow only units 0 and 1. (default) + + floppy=daring + Tells the floppy driver that you have a well behaved floppy controller. + This allows more efficient and smoother operation, but may fail on + certain controllers. This may speed up certain operations. + + floppy=0,daring + Tells the floppy driver that your floppy controller should be used + with caution. + + floppy=one_fdc + Tells the floppy driver that you have only one floppy controller. + (default) + + floppy=two_fdc / floppy=
,two_fdc + Tells the floppy driver that you have two floppy controllers. + The second floppy controller is assumed to be at
. + This option is not needed if the second controller is at address + 0x370, and if you use the 'cmos' option. + + floppy=thinkpad + Tells the floppy driver that you have a Thinkpad. Thinkpads use an + inverted convention for the disk change line. + + floppy=0,thinkpad + Tells the floppy driver that you don't have a Thinkpad. + + floppy=omnibook / floppy=nodma + Tells the floppy driver not to use Dma for data transfers. + This is needed on HP Omnibooks, which don't have a workable + DMA channel for the floppy driver. This option is also useful + if you frequently get "Unable to allocate DMA memory" messages. + Indeed, dma memory needs to be continuous in physical memory, + and is thus harder to find, whereas non-dma buffers may be + allocated in virtual memory. However, I advise against this if + you have an FDC without a FIFO (8272A or 82072). 82072A and + later are OK. You also need at least a 486 to use nodma. + If you use nodma mode, I suggest you also set the FIFO + threshold to 10 or lower, in order to limit the number of data + transfer interrupts. + + If you have a FIFO-able FDC, the floppy driver automatically + falls back on non DMA mode if no DMA-able memory can be found. + If you want to avoid this, explicitly ask for 'yesdma'. + + floppy=yesdma + Tells the floppy driver that a workable DMA channel is available. + (default) + + floppy=nofifo + Disables the FIFO entirely. This is needed if you get "Bus + master arbitration error" messages from your Ethernet card (or + from other devices) while accessing the floppy. + + floppy=usefifo + Enables the FIFO. (default) + + floppy=,fifo_depth + Sets the FIFO threshold. This is mostly relevant in DMA + mode. If this is higher, the floppy driver tolerates more + interrupt latency, but it triggers more interrupts (i.e. it + imposes more load on the rest of the system). If this is + lower, the interrupt latency should be lower too (faster + processor). The benefit of a lower threshold is less + interrupts. + + To tune the fifo threshold, switch on over/underrun messages + using 'floppycontrol --messages'. Then access a floppy + disk. If you get a huge amount of "Over/Underrun - retrying" + messages, then the fifo threshold is too low. Try with a + higher value, until you only get an occasional Over/Underrun. + It is a good idea to compile the floppy driver as a module + when doing this tuning. Indeed, it allows to try different + fifo values without rebooting the machine for each test. Note + that you need to do 'floppycontrol --messages' every time you + re-insert the module. + + Usually, tuning the fifo threshold should not be needed, as + the default (0xa) is reasonable. + + floppy=,,cmos + Sets the CMOS type of to . This is mandatory if + you have more than two floppy drives (only two can be + described in the physical CMOS), or if your BIOS uses + non-standard CMOS types. The CMOS types are: + + == ================================== + 0 Use the value of the physical CMOS + 1 5 1/4 DD + 2 5 1/4 HD + 3 3 1/2 DD + 4 3 1/2 HD + 5 3 1/2 ED + 6 3 1/2 ED + 16 unknown or not installed + == ================================== + + (Note: there are two valid types for ED drives. This is because 5 was + initially chosen to represent floppy *tapes*, and 6 for ED drives. + AMI ignored this, and used 5 for ED drives. That's why the floppy + driver handles both.) + + floppy=unexpected_interrupts + Print a warning message when an unexpected interrupt is received. + (default) + + floppy=no_unexpected_interrupts / floppy=L40SX + Don't print a message when an unexpected interrupt is received. This + is needed on IBM L40SX laptops in certain video modes. (There seems + to be an interaction between video and floppy. The unexpected + interrupts affect only performance, and can be safely ignored.) + + floppy=broken_dcl + Don't use the disk change line, but assume that the disk was + changed whenever the device node is reopened. Needed on some + boxes where the disk change line is broken or unsupported. + This should be regarded as a stopgap measure, indeed it makes + floppy operation less efficient due to unneeded cache + flushings, and slightly more unreliable. Please verify your + cable, connection and jumper settings if you have any DCL + problems. However, some older drives, and also some laptops + are known not to have a DCL. + + floppy=debug + Print debugging messages. + + floppy=messages + Print informational messages for some operations (disk change + notifications, warnings about over and underruns, and about + autodetection). + + floppy=silent_dcl_clear + Uses a less noisy way to clear the disk change line (which + doesn't involve seeks). Implied by 'daring' option. + + floppy=,irq + Sets the floppy IRQ to instead of 6. + + floppy=,dma + Sets the floppy DMA channel to instead of 2. + + floppy=slow + Use PS/2 stepping rate:: + + PS/2 floppies have much slower step rates than regular floppies. + It's been recommended that take about 1/4 of the default speed + in some more extreme cases. + + +Supporting utilities and additional documentation: +================================================== + +Additional parameters of the floppy driver can be configured at +runtime. Utilities which do this can be found in the fdutils package. +This package also contains a new version of mtools which allows to +access high capacity disks (up to 1992K on a high density 3 1/2 disk!). +It also contains additional documentation about the floppy driver. + +The latest version can be found at fdutils homepage: + + http://fdutils.linux.lu + +The fdutils releases can be found at: + + http://fdutils.linux.lu/download.html + + http://www.tux.org/pub/knaff/fdutils/ + + ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ + +Reporting problems about the floppy driver +========================================== + +If you have a question or a bug report about the floppy driver, mail +me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use +comp.os.linux.hardware. As the volume in these groups is rather high, +be sure to include the word "floppy" (or "FLOPPY") in the subject +line. If the reported problem happens when mounting floppy disks, be +sure to mention also the type of the filesystem in the subject line. + +Be sure to read the FAQ before mailing/posting any bug reports! + +Alain + +Changelog +========= + +10-30-2004 : + Cleanup, updating, add reference to module configuration. + James Nelson + +6-3-2000 : + Original Document diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst new file mode 100644 index 000000000000..20a738d9d047 --- /dev/null +++ b/Documentation/admin-guide/blockdev/index.rst @@ -0,0 +1,14 @@ +=========================== +The Linux RapidIO Subsystem +=========================== + +.. toctree:: + :maxdepth: 1 + + floppy + nbd + paride + ramdisk + zram + + drbd/index diff --git a/Documentation/admin-guide/blockdev/nbd.rst b/Documentation/admin-guide/blockdev/nbd.rst new file mode 100644 index 000000000000..d78dfe559dcf --- /dev/null +++ b/Documentation/admin-guide/blockdev/nbd.rst @@ -0,0 +1,31 @@ +================================== +Network Block Device (TCP version) +================================== + +1) Overview +----------- + +What is it: With this compiled in the kernel (or as a module), Linux +can use a remote server as one of its block devices. So every time +the client computer wants to read, e.g., /dev/nb0, it sends a +request over TCP to the server, which will reply with the data read. +This can be used for stations with low disk space (or even diskless) +to borrow disk space from another computer. +Unlike NFS, it is possible to put any filesystem on it, etc. + +For more information, or to download the nbd-client and nbd-server +tools, go to http://nbd.sf.net/. + +The nbd kernel module need only be installed on the client +system, as the nbd-server is completely in userspace. In fact, +the nbd-server has been successfully ported to other operating +systems, including Windows. + +A) NBD parameters +----------------- + +max_part + Number of partitions per device (default: 0). + +nbds_max + Number of block devices that should be initialized (default: 16). diff --git a/Documentation/admin-guide/blockdev/paride.rst b/Documentation/admin-guide/blockdev/paride.rst new file mode 100644 index 000000000000..87b4278bf314 --- /dev/null +++ b/Documentation/admin-guide/blockdev/paride.rst @@ -0,0 +1,439 @@ +=================================== +Linux and parallel port IDE devices +=================================== + +PARIDE v1.03 (c) 1997-8 Grant Guenther + +1. Introduction +=============== + +Owing to the simplicity and near universality of the parallel port interface +to personal computers, many external devices such as portable hard-disk, +CD-ROM, LS-120 and tape drives use the parallel port to connect to their +host computer. While some devices (notably scanners) use ad-hoc methods +to pass commands and data through the parallel port interface, most +external devices are actually identical to an internal model, but with +a parallel-port adapter chip added in. Some of the original parallel port +adapters were little more than mechanisms for multiplexing a SCSI bus. +(The Iomega PPA-3 adapter used in the ZIP drives is an example of this +approach). Most current designs, however, take a different approach. +The adapter chip reproduces a small ISA or IDE bus in the external device +and the communication protocol provides operations for reading and writing +device registers, as well as data block transfer functions. Sometimes, +the device being addressed via the parallel cable is a standard SCSI +controller like an NCR 5380. The "ditto" family of external tape +drives use the ISA replicator to interface a floppy disk controller, +which is then connected to a floppy-tape mechanism. The vast majority +of external parallel port devices, however, are now based on standard +IDE type devices, which require no intermediate controller. If one +were to open up a parallel port CD-ROM drive, for instance, one would +find a standard ATAPI CD-ROM drive, a power supply, and a single adapter +that interconnected a standard PC parallel port cable and a standard +IDE cable. It is usually possible to exchange the CD-ROM device with +any other device using the IDE interface. + +The document describes the support in Linux for parallel port IDE +devices. It does not cover parallel port SCSI devices, "ditto" tape +drives or scanners. Many different devices are supported by the +parallel port IDE subsystem, including: + + - MicroSolutions backpack CD-ROM + - MicroSolutions backpack PD/CD + - MicroSolutions backpack hard-drives + - MicroSolutions backpack 8000t tape drive + - SyQuest EZ-135, EZ-230 & SparQ drives + - Avatar Shark + - Imation Superdisk LS-120 + - Maxell Superdisk LS-120 + - FreeCom Power CD + - Hewlett-Packard 5GB and 8GB tape drives + - Hewlett-Packard 7100 and 7200 CD-RW drives + +as well as most of the clone and no-name products on the market. + +To support such a wide range of devices, PARIDE, the parallel port IDE +subsystem, is actually structured in three parts. There is a base +paride module which provides a registry and some common methods for +accessing the parallel ports. The second component is a set of +high-level drivers for each of the different types of supported devices: + + === ============= + pd IDE disk + pcd ATAPI CD-ROM + pf ATAPI disk + pt ATAPI tape + pg ATAPI generic + === ============= + +(Currently, the pg driver is only used with CD-R drives). + +The high-level drivers function according to the relevant standards. +The third component of PARIDE is a set of low-level protocol drivers +for each of the parallel port IDE adapter chips. Thanks to the interest +and encouragement of Linux users from many parts of the world, +support is available for almost all known adapter protocols: + + ==== ====================================== ==== + aten ATEN EH-100 (HK) + bpck Microsolutions backpack (US) + comm DataStor (old-type) "commuter" adapter (TW) + dstr DataStor EP-2000 (TW) + epat Shuttle EPAT (UK) + epia Shuttle EPIA (UK) + fit2 FIT TD-2000 (US) + fit3 FIT TD-3000 (US) + friq Freecom IQ cable (DE) + frpw Freecom Power (DE) + kbic KingByte KBIC-951A and KBIC-971A (TW) + ktti KT Technology PHd adapter (SG) + on20 OnSpec 90c20 (US) + on26 OnSpec 90c26 (US) + ==== ====================================== ==== + + +2. Using the PARIDE subsystem +============================= + +While configuring the Linux kernel, you may choose either to build +the PARIDE drivers into your kernel, or to build them as modules. + +In either case, you will need to select "Parallel port IDE device support" +as well as at least one of the high-level drivers and at least one +of the parallel port communication protocols. If you do not know +what kind of parallel port adapter is used in your drive, you could +begin by checking the file names and any text files on your DOS +installation floppy. Alternatively, you can look at the markings on +the adapter chip itself. That's usually sufficient to identify the +correct device. + +You can actually select all the protocol modules, and allow the PARIDE +subsystem to try them all for you. + +For the "brand-name" products listed above, here are the protocol +and high-level drivers that you would use: + + ================ ============ ====== ======== + Manufacturer Model Driver Protocol + ================ ============ ====== ======== + MicroSolutions CD-ROM pcd bpck + MicroSolutions PD drive pf bpck + MicroSolutions hard-drive pd bpck + MicroSolutions 8000t tape pt bpck + SyQuest EZ, SparQ pd epat + Imation Superdisk pf epat + Maxell Superdisk pf friq + Avatar Shark pd epat + FreeCom CD-ROM pcd frpw + Hewlett-Packard 5GB Tape pt epat + Hewlett-Packard 7200e (CD) pcd epat + Hewlett-Packard 7200e (CD-R) pg epat + ================ ============ ====== ======== + +2.1 Configuring built-in drivers +--------------------------------- + +We recommend that you get to know how the drivers work and how to +configure them as loadable modules, before attempting to compile a +kernel with the drivers built-in. + +If you built all of your PARIDE support directly into your kernel, +and you have just a single parallel port IDE device, your kernel should +locate it automatically for you. If you have more than one device, +you may need to give some command line options to your bootloader +(eg: LILO), how to do that is beyond the scope of this document. + +The high-level drivers accept a number of command line parameters, all +of which are documented in the source files in linux/drivers/block/paride. +By default, each driver will automatically try all parallel ports it +can find, and all protocol types that have been installed, until it finds +a parallel port IDE adapter. Once it finds one, the probe stops. So, +if you have more than one device, you will need to tell the drivers +how to identify them. This requires specifying the port address, the +protocol identification number and, for some devices, the drive's +chain ID. While your system is booting, a number of messages are +displayed on the console. Like all such messages, they can be +reviewed with the 'dmesg' command. Among those messages will be +some lines like:: + + paride: bpck registered as protocol 0 + paride: epat registered as protocol 1 + +The numbers will always be the same until you build a new kernel with +different protocol selections. You should note these numbers as you +will need them to identify the devices. + +If you happen to be using a MicroSolutions backpack device, you will +also need to know the unit ID number for each drive. This is usually +the last two digits of the drive's serial number (but read MicroSolutions' +documentation about this). + +As an example, let's assume that you have a MicroSolutions PD/CD drive +with unit ID number 36 connected to the parallel port at 0x378, a SyQuest +EZ-135 connected to the chained port on the PD/CD drive and also an +Imation Superdisk connected to port 0x278. You could give the following +options on your boot command:: + + pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 + +In the last option, pf.drive1 configures device /dev/pf1, the 0x378 +is the parallel port base address, the 0 is the protocol registration +number and 36 is the chain ID. + +Please note: while PARIDE will work both with and without the +PARPORT parallel port sharing system that is included by the +"Parallel port support" option, PARPORT must be included and enabled +if you want to use chains of devices on the same parallel port. + +2.2 Loading and configuring PARIDE as modules +---------------------------------------------- + +It is much faster and simpler to get to understand the PARIDE drivers +if you use them as loadable kernel modules. + +Note 1: + using these drivers with the "kerneld" automatic module loading + system is not recommended for beginners, and is not documented here. + +Note 2: + if you build PARPORT support as a loadable module, PARIDE must + also be built as loadable modules, and PARPORT must be loaded before + the PARIDE modules. + +To use PARIDE, you must begin by:: + + insmod paride + +this loads a base module which provides a registry for the protocols, +among other tasks. + +Then, load as many of the protocol modules as you think you might need. +As you load each module, it will register the protocols that it supports, +and print a log message to your kernel log file and your console. For +example:: + + # insmod epat + paride: epat registered as protocol 0 + # insmod kbic + paride: k951 registered as protocol 1 + paride: k971 registered as protocol 2 + +Finally, you can load high-level drivers for each kind of device that +you have connected. By default, each driver will autoprobe for a single +device, but you can support up to four similar devices by giving their +individual co-ordinates when you load the driver. + +For example, if you had two no-name CD-ROM drives both using the +KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc +you could give the following command:: + + # insmod pcd drive0=0x378,1 drive1=0x3bc,1 + +For most adapters, giving a port address and protocol number is sufficient, +but check the source files in linux/drivers/block/paride for more +information. (Hopefully someone will write some man pages one day !). + +As another example, here's what happens when PARPORT is installed, and +a SyQuest EZ-135 is attached to port 0x378:: + + # insmod paride + paride: version 1.0 installed + # insmod epat + paride: epat registered as protocol 0 + # insmod pd + pd: pd version 1.0, major 45, cluster 64, nice 0 + pda: Sharing parport1 at 0x378 + pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 + pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media + pda: pda1 + +Note that the last line is the output from the generic partition table +scanner - in this case it reports that it has found a disk with one partition. + +2.3 Using a PARIDE device +-------------------------- + +Once the drivers have been loaded, you can access PARIDE devices in the +same way as their traditional counterparts. You will probably need to +create the device "special files". Here is a simple script that you can +cut to a file and execute:: + + #!/bin/bash + # + # mkd -- a script to create the device special files for the PARIDE subsystem + # + function mkdev { + mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 + } + # + function pd { + D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) + mkdev pd$D b 45 $[ $1 * 16 ] + for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] + done + } + # + cd /dev + # + for u in 0 1 2 3 ; do pd $u ; done + for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done + for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done + for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done + for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done + for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done + # + # end of mkd + +With the device files and drivers in place, you can access PARIDE devices +like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: + + mount /dev/pcd0 /cdrom + +If you have a fresh Avatar Shark cartridge, and the drive is pda, you +might do something like:: + + fdisk /dev/pda -- make a new partition table with + partition 1 of type 83 + + mke2fs /dev/pda1 -- to build the file system + + mkdir /shark -- make a place to mount the disk + + mount /dev/pda1 /shark + +Devices like the Imation superdisk work in the same way, except that +they do not have a partition table. For example to make a 120MB +floppy that you could share with a DOS system:: + + mkdosfs /dev/pf0 + mount /dev/pf0 /mnt + + +2.4 The pf driver +------------------ + +The pf driver is intended for use with parallel port ATAPI disk +devices. The most common devices in this category are PD drives +and LS-120 drives. Traditionally, media for these devices are not +partitioned. Consequently, the pf driver does not support partitioned +media. This may be changed in a future version of the driver. + +2.5 Using the pt driver +------------------------ + +The pt driver for parallel port ATAPI tape drives is a minimal driver. +It does not yet support many of the standard tape ioctl operations. +For best performance, a block size of 32KB should be used. You will +probably want to set the parallel port delay to 0, if you can. + +2.6 Using the pg driver +------------------------ + +The pg driver can be used in conjunction with the cdrecord program +to create CD-ROMs. Please get cdrecord version 1.6.1 or later +from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media +your parallel port should ideally be set to EPP mode, and the "port delay" +should be set to 0. With those settings it is possible to record at 2x +speed without any buffer underruns. If you cannot get the driver to work +in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. + + +3. Troubleshooting +================== + +3.1 Use EPP mode if you can +---------------------------- + +The most common problems that people report with the PARIDE drivers +concern the parallel port CMOS settings. At this time, none of the +PARIDE protocol modules support ECP mode, or any ECP combination modes. +If you are able to do so, please set your parallel port into EPP mode +using your CMOS setup procedure. + +3.2 Check the port delay +------------------------- + +Some parallel ports cannot reliably transfer data at full speed. To +offset the errors, the PARIDE protocol modules introduce a "port +delay" between each access to the i/o ports. Each protocol sets +a default value for this delay. In most cases, the user can override +the default and set it to 0 - resulting in somewhat higher transfer +rates. In some rare cases (especially with older 486 systems) the +default delays are not long enough. if you experience corrupt data +transfers, or unexpected failures, you may wish to increase the +port delay. The delay can be programmed using the "driveN" parameters +to each of the high-level drivers. Please see the notes above, or +read the comments at the beginning of the driver source files in +linux/drivers/block/paride. + +3.3 Some drives need a printer reset +------------------------------------- + +There appear to be a number of "noname" external drives on the market +that do not always power up correctly. We have noticed this with some +drives based on OnSpec and older Freecom adapters. In these rare cases, +the adapter can often be reinitialised by issuing a "printer reset" on +the parallel port. As the reset operation is potentially disruptive in +multiple device environments, the PARIDE drivers will not do it +automatically. You can however, force a printer reset by doing:: + + insmod lp reset=1 + rmmod lp + +If you have one of these marginal cases, you should probably build +your paride drivers as modules, and arrange to do the printer reset +before loading the PARIDE drivers. + +3.4 Use the verbose option and dmesg if you need help +------------------------------------------------------ + +While a lot of testing has gone into these drivers to make them work +as smoothly as possible, problems will arise. If you do have problems, +please check all the obvious things first: does the drive work in +DOS with the manufacturer's drivers ? If that doesn't yield any useful +clues, then please make sure that only one drive is hooked to your system, +and that either (a) PARPORT is enabled or (b) no other device driver +is using your parallel port (check in /proc/ioports). Then, load the +appropriate drivers (you can load several protocol modules if you want) +as in:: + + # insmod paride + # insmod epat + # insmod bpck + # insmod kbic + ... + # insmod pd verbose=1 + +(using the correct driver for the type of device you have, of course). +The verbose=1 parameter will cause the drivers to log a trace of their +activity as they attempt to locate your drive. + +Use 'dmesg' to capture a log of all the PARIDE messages (any messages +beginning with paride:, a protocol module's name or a driver's name) and +include that with your bug report. You can submit a bug report in one +of two ways. Either send it directly to the author of the PARIDE suite, +by e-mail to grant@torque.net, or join the linux-parport mailing list +and post your report there. + +3.5 For more information or help +--------------------------------- + +You can join the linux-parport mailing list by sending a mail message +to: + + linux-parport-request@torque.net + +with the single word:: + + subscribe + +in the body of the mail message (not in the subject line). Please be +sure that your mail program is correctly set up when you do this, as +the list manager is a robot that will subscribe you using the reply +address in your mail headers. REMOVE any anti-spam gimmicks you may +have in your mail headers, when sending mail to the list server. + +You might also find some useful information on the linux-parport +web pages (although they are not always up to date) at + + http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/admin-guide/blockdev/ramdisk.rst b/Documentation/admin-guide/blockdev/ramdisk.rst new file mode 100644 index 000000000000..b7c2268f8dec --- /dev/null +++ b/Documentation/admin-guide/blockdev/ramdisk.rst @@ -0,0 +1,177 @@ +========================================== +Using the RAM disk block device with Linux +========================================== + +.. Contents: + + 1) Overview + 2) Kernel Command Line Parameters + 3) Using "rdev -r" + 4) An Example of Creating a Compressed RAM Disk + + +1) Overview +----------- + +The RAM disk driver is a way to use main system memory as a block device. It +is required for initrd, an initial filesystem used if you need to load modules +in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can +also be used for a temporary filesystem for crypto work, since the contents +are erased on reboot. + +The RAM disk dynamically grows as more space is required. It does this by using +RAM from the buffer cache. The driver marks the buffers it is using as dirty +so that the VM subsystem does not try to reclaim them later. + +The RAM disk supports up to 16 RAM disks by default, and can be reconfigured +to support an unlimited number of RAM disks (at your own risk). Just change +the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu +and (re)build the kernel. + +To use RAM disk support with your system, run './MAKEDEV ram' from the /dev +directory. RAM disks are all major number 1, and start with minor number 0 +for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. + +The new RAM disk also has the ability to load compressed RAM disk images, +allowing one to squeeze more programs onto an average installation or +rescue floppy disk. + + +2) Parameters +--------------------------------- + +2a) Kernel Command Line Parameters + + ramdisk_size=N + Size of the ramdisk. + +This parameter tells the RAM disk driver to set up RAM disks of N k size. The +default is 4096 (4 MB). + +2b) Module parameters + + rd_nr + /dev/ramX devices created. + + max_part + Maximum partition number. + + rd_size + See ramdisk_size. + +3) Using "rdev -r" +------------------ + +The usage of the word (two bytes) that "rdev -r" sets in the kernel image is +as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up +to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit +14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a +prompt/wait sequence is to be given before trying to read the RAM disk. Since +the RAM disk dynamically grows as data is being written into it, a size field +is not required. Bits 11 to 13 are not currently used and may as well be zero. +These numbers are no magical secrets, as seen below:: + + ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF + ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 + ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 + +Consider a typical two floppy disk setup, where you will have the +kernel on disk one, and have already put a RAM disk image onto disk #2. + +Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk +starts at an offset of 0 kB from the beginning of the floppy. +The command line equivalent is: "ramdisk_start=0" + +You want bit 14 as one, indicating that a RAM disk is to be loaded. +The command line equivalent is: "load_ramdisk=1" + +You want bit 15 as one, indicating that you want a prompt/keypress +sequence so that you have a chance to switch floppy disks. +The command line equivalent is: "prompt_ramdisk=1" + +Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. +So to create disk one of the set, you would do:: + + /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 + /usr/src/linux# rdev /dev/fd0 /dev/fd0 + /usr/src/linux# rdev -r /dev/fd0 49152 + +If you make a boot disk that has LILO, then for the above, you would use:: + + append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" + +Since the default start = 0 and the default prompt = 1, you could use:: + + append = "load_ramdisk=1" + + +4) An Example of Creating a Compressed RAM Disk +----------------------------------------------- + +To create a RAM disk image, you will need a spare block device to +construct it on. This can be the RAM disk device itself, or an +unused disk partition (such as an unmounted swap partition). For this +example, we will use the RAM disk device, "/dev/ram0". + +Note: This technique should not be done on a machine with less than 8 MB +of RAM. If using a spare disk partition instead of /dev/ram0, then this +restriction does not apply. + +a) Decide on the RAM disk size that you want. Say 2 MB for this example. + Create it by writing to the RAM disk device. (This step is not currently + required, but may be in the future.) It is wise to zero out the + area (esp. for disks) so that maximal compression is achieved for + the unused blocks of the image that you are about to create:: + + dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 + +b) Make a filesystem on it. Say ext2fs for this example:: + + mke2fs -vm0 /dev/ram0 2048 + +c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) + and unmount it again. + +d) Compress the contents of the RAM disk. The level of compression + will be approximately 50% of the space used by the files. Unused + space on the RAM disk will compress to almost nothing:: + + dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz + +e) Put the kernel onto the floppy:: + + dd if=zImage of=/dev/fd0 bs=1k + +f) Put the RAM disk image onto the floppy, after the kernel. Use an offset + that is slightly larger than the kernel, so that you can put another + (possibly larger) kernel onto the same floppy later without overlapping + the RAM disk image. An offset of 400 kB for kernels about 350 kB in + size would be reasonable. Make sure offset+size of ram_image.gz is + not larger than the total space on your floppy (usually 1440 kB):: + + dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 + +g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. + For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would + have 2^15 + 2^14 + 400 = 49552:: + + rdev /dev/fd0 /dev/fd0 + rdev -r /dev/fd0 49552 + +That is it. You now have your boot/root compressed RAM disk floppy. Some +users may wish to combine steps (d) and (f) by using a pipe. + + + Paul Gortmaker 12/95 + +Changelog: +---------- + +10-22-04 : + Updated to reflect changes in command line options, remove + obsolete references, general cleanup. + James Nelson (james4765@gmail.com) + + +12-95 : + Original Document diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst new file mode 100644 index 000000000000..6eccf13219ff --- /dev/null +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -0,0 +1,422 @@ +======================================== +zram: Compressed RAM based block devices +======================================== + +Introduction +============ + +The zram module creates RAM based block devices named /dev/zram +( = 0, 1, ...). Pages written to these disks are compressed and stored +in memory itself. These disks allow very fast I/O and compression provides +good amounts of memory savings. Some of the usecases include /tmp storage, +use as swap disks, various caches under /var and maybe many more :) + +Statistics for individual zram devices are exported through sysfs nodes at +/sys/block/zram/ + +Usage +===== + +There are several ways to configure and manage zram device(-s): + +a) using zram and zram_control sysfs attributes +b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). + +In this document we will describe only 'manual' zram configuration steps, +IOW, zram and zram_control sysfs attributes. + +In order to get a better idea about zramctl please consult util-linux +documentation, zramctl man-page or `zramctl --help`. Please be informed +that zram maintainers do not develop/maintain util-linux or zramctl, should +you have any questions please contact util-linux@vger.kernel.org + +Following shows a typical sequence of steps for using zram. + +WARNING +======= + +For the sake of simplicity we skip error checking parts in most of the +examples below. However, it is your sole responsibility to handle errors. + +zram sysfs attributes always return negative values in case of errors. +The list of possible return codes: + +======== ============================================================= +-EBUSY an attempt to modify an attribute that cannot be changed once + the device has been initialised. Please reset device first; +-ENOMEM zram was not able to allocate enough memory to fulfil your + needs; +-EINVAL invalid input has been provided. +======== ============================================================= + +If you use 'echo', the returned value that is changed by 'echo' utility, +and, in general case, something like:: + + echo 3 > /sys/block/zram0/max_comp_streams + if [ $? -ne 0 ]; + handle_error + fi + +should suffice. + +1) Load Module +============== + +:: + + modprobe zram num_devices=4 + This creates 4 devices: /dev/zram{0,1,2,3} + +num_devices parameter is optional and tells zram how many devices should be +pre-created. Default: 1. + +2) Set max number of compression streams +======================================== + +Regardless the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPUs - thus +allowing several concurrent compression operations. The number of +allocated compression streams goes down when some of the CPUs +become offline. There is no single-compression-stream mode anymore, +unless you are running a UP system or has only 1 CPU online. + +To find out how many streams are currently available:: + + cat /sys/block/zram0/max_comp_streams + +3) Select compression algorithm +=============================== + +Using comp_algorithm device attribute one can see available and +currently selected (shown in square brackets) compression algorithms, +change selected compression algorithm (once the device is initialised +there is no way to change compression algorithm). + +Examples:: + + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +For the time being, the `comp_algorithm` content does not necessarily +show every compression algorithm supported by the kernel. We keep this +list primarily to simplify device configuration and one can configure +a new device with a compression algorithm that is not listed in +`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API +and, if some of the algorithms were built as modules, it's impossible +to list all of them using, for instance, /proc/crypto or any other +method. This, however, has an advantage of permitting the usage of +custom crypto compression modules (implementing S/W or H/W compression). + +4) Set Disksize +=============== + +Set disk size by writing the value to sysfs node 'disksize'. +The value can be either in bytes or you can use mem suffixes. +Examples:: + + # Initialize /dev/zram0 with 50MB disksize + echo $((50*1024*1024)) > /sys/block/zram0/disksize + + # Using mem suffixes + echo 256K > /sys/block/zram0/disksize + echo 512M > /sys/block/zram0/disksize + echo 1G > /sys/block/zram0/disksize + +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + +5) Set memory limit: Optional +============================= + +Set memory limit by writing the value to sysfs node 'mem_limit'. +The value can be either in bytes or you can use mem suffixes. +In addition, you could change the value in runtime. +Examples:: + + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit + + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit + + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit + +6) Activate +=========== + +:: + + mkswap /dev/zram0 + swapon /dev/zram0 + + mkfs.ext4 /dev/zram1 + mount /dev/zram1 /tmp + +7) Add/remove zram devices +========================== + +zram provides a control interface, which enables dynamic (on-demand) device +addition and removal. + +In order to add a new /dev/zramX device, perform read operation on hot_add +attribute. This will return either new device's device id (meaning that you +can use /dev/zram) or error code. + +Example:: + + cat /sys/class/zram-control/hot_add + 1 + +To remove the existing /dev/zramX device (where X is a device id) +execute:: + + echo X > /sys/class/zram-control/hot_remove + +8) Stats +======== + +Per-device statistics are exported as various nodes under /sys/block/zram/ + +A brief description of exported device attributes. For more details please +read Documentation/ABI/testing/sysfs-block-zram. + +====================== ====== =============================================== +Name access description +====================== ====== =============================================== +disksize RW show and set the device's disk size +initstate RO shows the initialization state of the device +reset WO trigger device reset +mem_used_max WO reset the `mem_used_max` counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can + use to store the compressed data +writeback_limit WO specifies the maximum amount of write IO zram + can write out to backing device as 4KB unit +writeback_limit_enable RW show and set writeback_limit feature +max_comp_streams RW the number of possible concurrent compress + operations +comp_algorithm RW show and change the compression algorithm +compact WO trigger memory compaction +debug_stat RO this file is used for zram debugging purposes +backing_dev RW set up backend storage for zram to write out +idle WO mark allocated slot as idle +====================== ====== =============================================== + + +User space is advised to use the following files to read the device statistics. + +File /sys/block/zram/stat + +Represents block layer statistics. Read Documentation/block/stat.rst for +details. + +File /sys/block/zram/io_stat + +The stat file represents device's I/O statistics not accounted by block +layer and, thus, not available in zram/stat file. It consists of a +single line of text and contains the following stats separated by +whitespace: + + ============= ============================================================= + failed_reads The number of failed reads + failed_writes The number of failed writes + invalid_io The number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + + a) the number of pages freed because of swap slot free + notifications + b) the number of pages freed because of + REQ_OP_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. + ============= ============================================================= + +File /sys/block/zram/mm_stat + +The stat file represents device's mm statistics. It consists of a single +line of text and contains the following stats separated by whitespace: + + ================ ============================================================= + orig_data_size uncompressed size of data stored in this disk. + This excludes same-element-filled pages (same_pages) since + no memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + same_pages the number of same element filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages + ================ ============================================================= + +File /sys/block/zram/bd_stat + +The stat file represents device's backing device statistics. It consists of +a single line of text and contains the following stats separated by whitespace: + + ============== ============================================================= + bd_count size of data written in backing device. + Unit: 4K bytes + bd_reads the number of reads from backing device + Unit: 4K bytes + bd_writes the number of writes to backing device + Unit: 4K bytes + ============== ============================================================= + +9) Deactivate +============= + +:: + + swapoff /dev/zram0 + umount /dev/zram1 + +10) Reset +========= + + Write any positive value to 'reset' sysfs node:: + + echo 1 > /sys/block/zram0/reset + echo 1 > /sys/block/zram1/reset + + This frees all the memory allocated for the given device and + resets the disksize to zero. You must set the disksize again + before reusing the device. + +Optional Feature +================ + +writeback +--------- + +With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page +to backing storage rather than keeping it in memory. +To use the feature, admin should set up backing device via:: + + echo /dev/sda5 > /sys/block/zramX/backing_dev + +before disksize setting. It supports only partition at this moment. +If admin want to use incompressible page writeback, they could do via:: + + echo huge > /sys/block/zramX/write + +To use idle page writeback, first, user need to declare zram pages +as idle:: + + echo all > /sys/block/zramX/idle + +From now on, any pages on zram are idle pages. The idle mark +will be removed until someone request access of the block. +IOW, unless there is access request, those pages are still idle pages. + +Admin can request writeback of those idle pages at right timing via:: + + echo idle > /sys/block/zramX/writeback + +With the command, zram writeback idle pages from memory to the storage. + +If there are lots of write IO with flash device, potentially, it has +flash wearout problem so that admin needs to design write limitation +to guarantee storage health for entire product life. + +To overcome the concern, zram supports "writeback_limit" feature. +The "writeback_limit_enable"'s default value is 0 so that it doesn't limit +any writeback. IOW, if admin want to apply writeback budget, he should +enable writeback_limit_enable via:: + + $ echo 1 > /sys/block/zramX/writeback_limit_enable + +Once writeback_limit_enable is set, zram doesn't allow any writeback +until admin set the budget via /sys/block/zramX/writeback_limit. + +(If admin doesn't enable writeback_limit_enable, writeback_limit's value +assigned via /sys/block/zramX/writeback_limit is meaninless.) + +If admin want to limit writeback as per-day 400M, he could do it +like below:: + + $ MB_SHIFT=20 + $ 4K_SHIFT=12 + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit. + $ echo 1 > /sys/block/zram0/writeback_limit_enable + +If admin want to allow further write again once the bugdet is exausted, +he could do it like below:: + + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit + +If admin want to see remaining writeback budget since he set:: + + $ cat /sys/block/zramX/writeback_limit + +If admin want to disable writeback limit, he could do:: + + $ echo 0 > /sys/block/zramX/writeback_limit_enable + +The writeback_limit count will reset whenever you reset zram(e.g., +system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of +writeback happened until you reset the zram to allocate extra writeback +budget in next setting is user's job. + +If admin want to measure writeback count in a certain period, he could +know it via /sys/block/zram0/bd_stat's 3rd column. + +memory tracking +=============== + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. + +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: + + 300 75.033841 .wh. + 301 63.806904 s... + 302 63.806919 ..hi + +First column + zram's block index. +Second column + access time since the system was booted +Third column + state of the block: + + s: + same page + w: + written page to backing store + h: + huge page + i: + idle page + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + +Nitin Gupta +ngupta@vflare.org diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 5b63182ceb5f..9228fbf5ce4e 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -73,6 +73,7 @@ configure specific aspects of kernel behavior to your liking. java ras bcache + blockdev/index ext4 binderfs pm/index diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e645b3ab4b6f..78576aa45cce 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1249,7 +1249,7 @@ See also Documentation/fault-injection/. floppy= [HW] - See Documentation/blockdev/floppy.rst. + See Documentation/admin-guide/blockdev/floppy.rst. force_pal_cache_flush [IA-64] Avoid check_sal_cache_flush which may hang on @@ -2234,7 +2234,7 @@ memblock=debug [KNL] Enable memblock debug messages. load_ramdisk= [RAM] List of ramdisks to load from floppy - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. lockd.nlm_grace_period=P [NFS] Assign grace period. Format: @@ -3268,7 +3268,7 @@ pcd. [PARIDE] See header of drivers/block/paride/pcd.c. - See also Documentation/blockdev/paride.rst. + See also Documentation/admin-guide/blockdev/paride.rst. pci=option[,option...] [PCI] various PCI subsystem options. @@ -3512,7 +3512,7 @@ needed on a platform with proper driver support. pd. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at boot time. @@ -3527,10 +3527,10 @@ and performance comparison. pf. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pg. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pirq= [SMP,APIC] Manual mp-table setup See Documentation/x86/i386/IO-APIC.rst. @@ -3642,7 +3642,7 @@ prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk before loading. - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. psi= [KNL] Enable or disable pressure stall information tracking. @@ -3664,7 +3664,7 @@ pstore.backend= Specify the name of the pstore backend to use pt. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pti= [X86_64] Control Page Table Isolation of user and kernel address spaces. Disabling this feature @@ -3693,7 +3693,7 @@ See Documentation/admin-guide/md.rst. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. random.trust_cpu={on,off} [KNL] Enable or disable trusting the use of the diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg deleted file mode 100644 index f87cfa0dc2fb..000000000000 --- a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg +++ /dev/null @@ -1,588 +0,0 @@ - - - - - - Master slide - - - - - - - - - - RSDataReply - - - - - - - CsumRSRequest - - - - w_make_resync_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_csum_rs_req() - - - receive_RSDataReply() - - - drbd_endio_write_sec() - - - e_end_resync_block() - - - - - - WriteAck - - - - got_BlockAck() - - - Checksum based Resync, case not in sync - - - DRBD-8.3 data flow - - - w_e_send_csum() - - - - - - - - RSIsInSync - - - - - - - CsumRSRequest - - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_csum_rs_req() - - - got_IsInSync() - - - Checksum based Resync, case in sync - - - - - - - - - - OVReply - - - - - - - OVRequest - - - - receive_OVRequest() - - - drbd_endio_read_sec() - - - w_e_end_ov_req() - - - receive_OVReply() - - - drbd_endio_read_sec() - - - w_e_end_ov_reply() - - - - - - OVResult - - - - got_OVResult() - - - Online verify - - - w_make_ov_request() - - - - - - - - drbd_endio_read_sec() - - - w_make_resync_request() - - - w_e_send_csum() - - - - - drbd_endio_read_sec() - - - - - - rs_begin_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_complete_io() - - diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg deleted file mode 100644 index 48a1e2165fec..000000000000 --- a/Documentation/blockdev/drbd/DRBD-data-packets.svg +++ /dev/null @@ -1,459 +0,0 @@ - - - - - - Master slide - - - - - - - - - RSDataReply - - - - - RSDataRequest - - - w_make_resync_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_rsdata_req() - - - receive_RSDataReply() - - - drbd_endio_write_sec() - - - e_end_resync_block() - - - - - WriteAck - - - got_BlockAck() - - - Resync blocks, 4-32K - - - - - - - WriteAck - - - - - Data - - - drbd_make_request() - - - receive_Data() - - - drbd_endio_write_sec() - - - e_end_block() - - - got_BlockAck() - - - Regular mirrored write, 512-32K - - - w_send_dblock() - - - - - drbd_endio_write_pri() - - - - - - - DataReply - - - - - DataRequest - - - drbd_make_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_data_req() - - - Drawing - - receive_DataReply() - - - - Diskless read, 512-32K - - - w_send_read_req() - - - DRBD 8 data flow - - - - - - al_begin_io() - - - al_complete_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_begin_io() - - - rs_complete_io() - - diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot deleted file mode 100644 index 025e8cf5e64a..000000000000 --- a/Documentation/blockdev/drbd/conn-states-8.dot +++ /dev/null @@ -1,18 +0,0 @@ -digraph conn_states { - StandAllone -> WFConnection [ label = "ioctl_set_net()" ] - WFConnection -> Unconnected [ label = "unable to bind()" ] - WFConnection -> WFReportParams [ label = "in connect() after accept" ] - WFReportParams -> StandAllone [ label = "checks in receive_param()" ] - WFReportParams -> Connected [ label = "in receive_param()" ] - WFReportParams -> WFBitMapS [ label = "sync_handshake()" ] - WFReportParams -> WFBitMapT [ label = "sync_handshake()" ] - WFBitMapS -> SyncSource [ label = "receive_bitmap()" ] - WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ] - SyncSource -> Connected - SyncTarget -> Connected - SyncSource -> PausedSyncS - SyncTarget -> PausedSyncT - PausedSyncS -> SyncSource - PausedSyncT -> SyncTarget - Connected -> WFConnection [ label = "* on network error" ] -} diff --git a/Documentation/blockdev/drbd/data-structure-v9.rst b/Documentation/blockdev/drbd/data-structure-v9.rst deleted file mode 100644 index 66036b901644..000000000000 --- a/Documentation/blockdev/drbd/data-structure-v9.rst +++ /dev/null @@ -1,42 +0,0 @@ -================================ -kernel data structure for DRBD-9 -================================ - -This describes the in kernel data structure for DRBD-9. Starting with -Linux v3.14 we are reorganizing DRBD to use this data structure. - -Basic Data Structure -==================== - -A node has a number of DRBD resources. Each such resource has a number of -devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD -device is represented by a block device locally. - -The DRBD objects are interconnected to form a matrix as depicted below; a -drbd_peer_device object sits at each intersection between a drbd_device and a -drbd_connection:: - - /--------------+---------------+.....+---------------\ - | resource | device | | device | - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - +--------------+---------------+.....+---------------+ - : : : : : - : : : : : - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - \--------------+---------------+.....+---------------/ - -In this table, horizontally, devices can be accessed from resources by their -volume number. Likewise, peer_devices can be accessed from connections by -their volume number. Objects in the vertical direction are connected by double -linked lists. There are back pointers from peer_devices to their connections a -devices, and from connections and devices to their resource. - -All resources are in the drbd_resources double-linked list. In addition, all -devices can be accessed by their minor device number via the drbd_devices idr. - -The drbd_resource, drbd_connection, and drbd_device objects are reference -counted. The peer_device objects only serve to establish the links between -devices and connections; their lifetime is determined by the lifetime of the -device and connection which they reference. diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot deleted file mode 100644 index d06cfb46fb98..000000000000 --- a/Documentation/blockdev/drbd/disk-states-8.dot +++ /dev/null @@ -1,16 +0,0 @@ -digraph disk_states { - Diskless -> Inconsistent [ label = "ioctl_set_disk()" ] - Diskless -> Consistent [ label = "ioctl_set_disk()" ] - Diskless -> Outdated [ label = "ioctl_set_disk()" ] - Consistent -> Outdated [ label = "receive_param()" ] - Consistent -> UpToDate [ label = "receive_param()" ] - Consistent -> Inconsistent [ label = "start resync" ] - Outdated -> Inconsistent [ label = "start resync" ] - UpToDate -> Inconsistent [ label = "ioctl_replicate" ] - Inconsistent -> UpToDate [ label = "resync completed" ] - Consistent -> Failed [ label = "io completion error" ] - Outdated -> Failed [ label = "io completion error" ] - UpToDate -> Failed [ label = "io completion error" ] - Inconsistent -> Failed [ label = "io completion error" ] - Failed -> Diskless [ label = "sending notify to peer" ] -} diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot deleted file mode 100644 index 6d9cf0a7b11d..000000000000 --- a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot +++ /dev/null @@ -1,85 +0,0 @@ -// vim: set sw=2 sts=2 : -digraph { - rankdir=BT - bgcolor=white - - node [shape=plaintext] - node [fontcolor=black] - - StandAlone [ style=filled,fillcolor=gray,label=StandAlone ] - - node [fontcolor=lightgray] - - Unconnected [ label=Unconnected ] - - CommTrouble [ shape=record, - label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ] - - node [fontcolor=gray] - - subgraph cluster_try_connect { - label="try to connect, handshake" - rank=max - WFConnection [ label=WFConnection ] - WFReportParams [ label=WFReportParams ] - } - - TearDown [ label=TearDown ] - - Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ] - - node [fontcolor=lightblue] - - StartingSyncS [ label=StartingSyncS ] - StartingSyncT [ label=StartingSyncT ] - - subgraph cluster_bitmap_exchange { - node [fontcolor=red] - fontcolor=red - label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged" - - WFBitMapT [ label=WFBitMapT ] - WFSyncUUID [ label=WFSyncUUID ] - WFBitMapS [ label=WFBitMapS ] - } - - node [fontcolor=blue] - - cluster_resync [ shape=record,label="{resynchronisation process running\l'concurrent' application requests allowed|{{PausedSyncT\nSyncTarget}|{PausedSyncS\nSyncSource}}}" ] - - node [shape=box,fontcolor=black] - - // drbdadm [label="drbdadm connect"] - // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."] - // comm_error [label="communication trouble"] - - // - // edges - // -------------------------------------- - - StandAlone -> Unconnected [ label="drbdadm connect" ] - Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ] - Unconnected -> WFConnection [ label="receiver thread is started" ] - WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ] - - WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ] - WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ] - - WFReportParams -> WFBitMapS - WFReportParams -> WFBitMapT - WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false] - - WFBitMapS -> cluster_resync:S - WFSyncUUID -> cluster_resync:T - - edge [color=green] - cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ] - - edge [color=red] - WFReportParams -> CommTrouble - Connected -> CommTrouble - cluster_resync:any -> CommTrouble - edge [color=black] - CommTrouble -> Unconnected [label="receiver thread is stopped" ] - -} diff --git a/Documentation/blockdev/drbd/figures.rst b/Documentation/blockdev/drbd/figures.rst deleted file mode 100644 index 3e3fd4b8a478..000000000000 --- a/Documentation/blockdev/drbd/figures.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. The here included files are intended to help understand the implementation - -Data flows that Relate some functions, and write packets -======================================================== - -.. kernel-figure:: DRBD-8.3-data-packets.svg - :alt: DRBD-8.3-data-packets.svg - :align: center - -.. kernel-figure:: DRBD-data-packets.svg - :alt: DRBD-data-packets.svg - :align: center - - -Sub graphs of DRBD's state transitions -====================================== - -.. kernel-figure:: conn-states-8.dot - :alt: conn-states-8.dot - :align: center - -.. kernel-figure:: disk-states-8.dot - :alt: disk-states-8.dot - :align: center - -.. kernel-figure:: node-states-8.dot - :alt: node-states-8.dot - :align: center diff --git a/Documentation/blockdev/drbd/index.rst b/Documentation/blockdev/drbd/index.rst deleted file mode 100644 index 68ecd5c113e9..000000000000 --- a/Documentation/blockdev/drbd/index.rst +++ /dev/null @@ -1,19 +0,0 @@ -========================================== -Distributed Replicated Block Device - DRBD -========================================== - -Description -=========== - - DRBD is a shared-nothing, synchronously replicated block device. It - is designed to serve as a building block for high availability - clusters and in this context, is a "drop-in" replacement for shared - storage. Simplistically, you could see it as a network RAID 1. - - Please visit http://www.drbd.org to find out more. - -.. toctree:: - :maxdepth: 1 - - data-structure-v9 - figures diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot deleted file mode 100644 index 4a2b00c23547..000000000000 --- a/Documentation/blockdev/drbd/node-states-8.dot +++ /dev/null @@ -1,14 +0,0 @@ -digraph node_states { - Secondary -> Primary [ label = "ioctl_set_state()" ] - Primary -> Secondary [ label = "ioctl_set_state()" ] -} - -digraph peer_states { - Secondary -> Primary [ label = "recv state packet" ] - Primary -> Secondary [ label = "recv state packet" ] - Primary -> Unknown [ label = "connection lost" ] - Secondary -> Unknown [ label = "connection lost" ] - Unknown -> Primary [ label = "connected" ] - Unknown -> Secondary [ label = "connected" ] -} - diff --git a/Documentation/blockdev/floppy.rst b/Documentation/blockdev/floppy.rst deleted file mode 100644 index 4a8f31cf4139..000000000000 --- a/Documentation/blockdev/floppy.rst +++ /dev/null @@ -1,255 +0,0 @@ -============= -Floppy Driver -============= - -FAQ list: -========= - -A FAQ list may be found in the fdutils package (see below), and also -at . - - -LILO configuration options (Thinkpad users, read this) -====================================================== - -The floppy driver is configured using the 'floppy=' option in -lilo. This option can be typed at the boot prompt, or entered in the -lilo configuration file. - -Example: If your kernel is called linux-2.6.9, type the following line -at the lilo boot prompt (if you have a thinkpad):: - - linux-2.6.9 floppy=thinkpad - -You may also enter the following line in /etc/lilo.conf, in the description -of linux-2.6.9:: - - append = "floppy=thinkpad" - -Several floppy related options may be given, example:: - - linux-2.6.9 floppy=daring floppy=two_fdc - append = "floppy=daring floppy=two_fdc" - -If you give options both in the lilo config file and on the boot -prompt, the option strings of both places are concatenated, the boot -prompt options coming last. That's why there are also options to -restore the default behavior. - - -Module configuration options -============================ - -If you use the floppy driver as a module, use the following syntax:: - - modprobe floppy floppy="" - -Example:: - - modprobe floppy floppy="omnibook messages" - -If you need certain options enabled every time you load the floppy driver, -you can put:: - - options floppy floppy="omnibook messages" - -in a configuration file in /etc/modprobe.d/. - - -The floppy driver related options are: - - floppy=asus_pci - Sets the bit mask to allow only units 0 and 1. (default) - - floppy=daring - Tells the floppy driver that you have a well behaved floppy controller. - This allows more efficient and smoother operation, but may fail on - certain controllers. This may speed up certain operations. - - floppy=0,daring - Tells the floppy driver that your floppy controller should be used - with caution. - - floppy=one_fdc - Tells the floppy driver that you have only one floppy controller. - (default) - - floppy=two_fdc / floppy=
,two_fdc - Tells the floppy driver that you have two floppy controllers. - The second floppy controller is assumed to be at
. - This option is not needed if the second controller is at address - 0x370, and if you use the 'cmos' option. - - floppy=thinkpad - Tells the floppy driver that you have a Thinkpad. Thinkpads use an - inverted convention for the disk change line. - - floppy=0,thinkpad - Tells the floppy driver that you don't have a Thinkpad. - - floppy=omnibook / floppy=nodma - Tells the floppy driver not to use Dma for data transfers. - This is needed on HP Omnibooks, which don't have a workable - DMA channel for the floppy driver. This option is also useful - if you frequently get "Unable to allocate DMA memory" messages. - Indeed, dma memory needs to be continuous in physical memory, - and is thus harder to find, whereas non-dma buffers may be - allocated in virtual memory. However, I advise against this if - you have an FDC without a FIFO (8272A or 82072). 82072A and - later are OK. You also need at least a 486 to use nodma. - If you use nodma mode, I suggest you also set the FIFO - threshold to 10 or lower, in order to limit the number of data - transfer interrupts. - - If you have a FIFO-able FDC, the floppy driver automatically - falls back on non DMA mode if no DMA-able memory can be found. - If you want to avoid this, explicitly ask for 'yesdma'. - - floppy=yesdma - Tells the floppy driver that a workable DMA channel is available. - (default) - - floppy=nofifo - Disables the FIFO entirely. This is needed if you get "Bus - master arbitration error" messages from your Ethernet card (or - from other devices) while accessing the floppy. - - floppy=usefifo - Enables the FIFO. (default) - - floppy=,fifo_depth - Sets the FIFO threshold. This is mostly relevant in DMA - mode. If this is higher, the floppy driver tolerates more - interrupt latency, but it triggers more interrupts (i.e. it - imposes more load on the rest of the system). If this is - lower, the interrupt latency should be lower too (faster - processor). The benefit of a lower threshold is less - interrupts. - - To tune the fifo threshold, switch on over/underrun messages - using 'floppycontrol --messages'. Then access a floppy - disk. If you get a huge amount of "Over/Underrun - retrying" - messages, then the fifo threshold is too low. Try with a - higher value, until you only get an occasional Over/Underrun. - It is a good idea to compile the floppy driver as a module - when doing this tuning. Indeed, it allows to try different - fifo values without rebooting the machine for each test. Note - that you need to do 'floppycontrol --messages' every time you - re-insert the module. - - Usually, tuning the fifo threshold should not be needed, as - the default (0xa) is reasonable. - - floppy=,,cmos - Sets the CMOS type of to . This is mandatory if - you have more than two floppy drives (only two can be - described in the physical CMOS), or if your BIOS uses - non-standard CMOS types. The CMOS types are: - - == ================================== - 0 Use the value of the physical CMOS - 1 5 1/4 DD - 2 5 1/4 HD - 3 3 1/2 DD - 4 3 1/2 HD - 5 3 1/2 ED - 6 3 1/2 ED - 16 unknown or not installed - == ================================== - - (Note: there are two valid types for ED drives. This is because 5 was - initially chosen to represent floppy *tapes*, and 6 for ED drives. - AMI ignored this, and used 5 for ED drives. That's why the floppy - driver handles both.) - - floppy=unexpected_interrupts - Print a warning message when an unexpected interrupt is received. - (default) - - floppy=no_unexpected_interrupts / floppy=L40SX - Don't print a message when an unexpected interrupt is received. This - is needed on IBM L40SX laptops in certain video modes. (There seems - to be an interaction between video and floppy. The unexpected - interrupts affect only performance, and can be safely ignored.) - - floppy=broken_dcl - Don't use the disk change line, but assume that the disk was - changed whenever the device node is reopened. Needed on some - boxes where the disk change line is broken or unsupported. - This should be regarded as a stopgap measure, indeed it makes - floppy operation less efficient due to unneeded cache - flushings, and slightly more unreliable. Please verify your - cable, connection and jumper settings if you have any DCL - problems. However, some older drives, and also some laptops - are known not to have a DCL. - - floppy=debug - Print debugging messages. - - floppy=messages - Print informational messages for some operations (disk change - notifications, warnings about over and underruns, and about - autodetection). - - floppy=silent_dcl_clear - Uses a less noisy way to clear the disk change line (which - doesn't involve seeks). Implied by 'daring' option. - - floppy=,irq - Sets the floppy IRQ to instead of 6. - - floppy=,dma - Sets the floppy DMA channel to instead of 2. - - floppy=slow - Use PS/2 stepping rate:: - - PS/2 floppies have much slower step rates than regular floppies. - It's been recommended that take about 1/4 of the default speed - in some more extreme cases. - - -Supporting utilities and additional documentation: -================================================== - -Additional parameters of the floppy driver can be configured at -runtime. Utilities which do this can be found in the fdutils package. -This package also contains a new version of mtools which allows to -access high capacity disks (up to 1992K on a high density 3 1/2 disk!). -It also contains additional documentation about the floppy driver. - -The latest version can be found at fdutils homepage: - - http://fdutils.linux.lu - -The fdutils releases can be found at: - - http://fdutils.linux.lu/download.html - - http://www.tux.org/pub/knaff/fdutils/ - - ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ - -Reporting problems about the floppy driver -========================================== - -If you have a question or a bug report about the floppy driver, mail -me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use -comp.os.linux.hardware. As the volume in these groups is rather high, -be sure to include the word "floppy" (or "FLOPPY") in the subject -line. If the reported problem happens when mounting floppy disks, be -sure to mention also the type of the filesystem in the subject line. - -Be sure to read the FAQ before mailing/posting any bug reports! - -Alain - -Changelog -========= - -10-30-2004 : - Cleanup, updating, add reference to module configuration. - James Nelson - -6-3-2000 : - Original Document diff --git a/Documentation/blockdev/index.rst b/Documentation/blockdev/index.rst deleted file mode 100644 index a9af6ed8b4aa..000000000000 --- a/Documentation/blockdev/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -:orphan: - -=========================== -The Linux RapidIO Subsystem -=========================== - -.. toctree:: - :maxdepth: 1 - - floppy - nbd - paride - ramdisk - zram - - drbd/index diff --git a/Documentation/blockdev/nbd.rst b/Documentation/blockdev/nbd.rst deleted file mode 100644 index d78dfe559dcf..000000000000 --- a/Documentation/blockdev/nbd.rst +++ /dev/null @@ -1,31 +0,0 @@ -================================== -Network Block Device (TCP version) -================================== - -1) Overview ------------ - -What is it: With this compiled in the kernel (or as a module), Linux -can use a remote server as one of its block devices. So every time -the client computer wants to read, e.g., /dev/nb0, it sends a -request over TCP to the server, which will reply with the data read. -This can be used for stations with low disk space (or even diskless) -to borrow disk space from another computer. -Unlike NFS, it is possible to put any filesystem on it, etc. - -For more information, or to download the nbd-client and nbd-server -tools, go to http://nbd.sf.net/. - -The nbd kernel module need only be installed on the client -system, as the nbd-server is completely in userspace. In fact, -the nbd-server has been successfully ported to other operating -systems, including Windows. - -A) NBD parameters ------------------ - -max_part - Number of partitions per device (default: 0). - -nbds_max - Number of block devices that should be initialized (default: 16). diff --git a/Documentation/blockdev/paride.rst b/Documentation/blockdev/paride.rst deleted file mode 100644 index 87b4278bf314..000000000000 --- a/Documentation/blockdev/paride.rst +++ /dev/null @@ -1,439 +0,0 @@ -=================================== -Linux and parallel port IDE devices -=================================== - -PARIDE v1.03 (c) 1997-8 Grant Guenther - -1. Introduction -=============== - -Owing to the simplicity and near universality of the parallel port interface -to personal computers, many external devices such as portable hard-disk, -CD-ROM, LS-120 and tape drives use the parallel port to connect to their -host computer. While some devices (notably scanners) use ad-hoc methods -to pass commands and data through the parallel port interface, most -external devices are actually identical to an internal model, but with -a parallel-port adapter chip added in. Some of the original parallel port -adapters were little more than mechanisms for multiplexing a SCSI bus. -(The Iomega PPA-3 adapter used in the ZIP drives is an example of this -approach). Most current designs, however, take a different approach. -The adapter chip reproduces a small ISA or IDE bus in the external device -and the communication protocol provides operations for reading and writing -device registers, as well as data block transfer functions. Sometimes, -the device being addressed via the parallel cable is a standard SCSI -controller like an NCR 5380. The "ditto" family of external tape -drives use the ISA replicator to interface a floppy disk controller, -which is then connected to a floppy-tape mechanism. The vast majority -of external parallel port devices, however, are now based on standard -IDE type devices, which require no intermediate controller. If one -were to open up a parallel port CD-ROM drive, for instance, one would -find a standard ATAPI CD-ROM drive, a power supply, and a single adapter -that interconnected a standard PC parallel port cable and a standard -IDE cable. It is usually possible to exchange the CD-ROM device with -any other device using the IDE interface. - -The document describes the support in Linux for parallel port IDE -devices. It does not cover parallel port SCSI devices, "ditto" tape -drives or scanners. Many different devices are supported by the -parallel port IDE subsystem, including: - - - MicroSolutions backpack CD-ROM - - MicroSolutions backpack PD/CD - - MicroSolutions backpack hard-drives - - MicroSolutions backpack 8000t tape drive - - SyQuest EZ-135, EZ-230 & SparQ drives - - Avatar Shark - - Imation Superdisk LS-120 - - Maxell Superdisk LS-120 - - FreeCom Power CD - - Hewlett-Packard 5GB and 8GB tape drives - - Hewlett-Packard 7100 and 7200 CD-RW drives - -as well as most of the clone and no-name products on the market. - -To support such a wide range of devices, PARIDE, the parallel port IDE -subsystem, is actually structured in three parts. There is a base -paride module which provides a registry and some common methods for -accessing the parallel ports. The second component is a set of -high-level drivers for each of the different types of supported devices: - - === ============= - pd IDE disk - pcd ATAPI CD-ROM - pf ATAPI disk - pt ATAPI tape - pg ATAPI generic - === ============= - -(Currently, the pg driver is only used with CD-R drives). - -The high-level drivers function according to the relevant standards. -The third component of PARIDE is a set of low-level protocol drivers -for each of the parallel port IDE adapter chips. Thanks to the interest -and encouragement of Linux users from many parts of the world, -support is available for almost all known adapter protocols: - - ==== ====================================== ==== - aten ATEN EH-100 (HK) - bpck Microsolutions backpack (US) - comm DataStor (old-type) "commuter" adapter (TW) - dstr DataStor EP-2000 (TW) - epat Shuttle EPAT (UK) - epia Shuttle EPIA (UK) - fit2 FIT TD-2000 (US) - fit3 FIT TD-3000 (US) - friq Freecom IQ cable (DE) - frpw Freecom Power (DE) - kbic KingByte KBIC-951A and KBIC-971A (TW) - ktti KT Technology PHd adapter (SG) - on20 OnSpec 90c20 (US) - on26 OnSpec 90c26 (US) - ==== ====================================== ==== - - -2. Using the PARIDE subsystem -============================= - -While configuring the Linux kernel, you may choose either to build -the PARIDE drivers into your kernel, or to build them as modules. - -In either case, you will need to select "Parallel port IDE device support" -as well as at least one of the high-level drivers and at least one -of the parallel port communication protocols. If you do not know -what kind of parallel port adapter is used in your drive, you could -begin by checking the file names and any text files on your DOS -installation floppy. Alternatively, you can look at the markings on -the adapter chip itself. That's usually sufficient to identify the -correct device. - -You can actually select all the protocol modules, and allow the PARIDE -subsystem to try them all for you. - -For the "brand-name" products listed above, here are the protocol -and high-level drivers that you would use: - - ================ ============ ====== ======== - Manufacturer Model Driver Protocol - ================ ============ ====== ======== - MicroSolutions CD-ROM pcd bpck - MicroSolutions PD drive pf bpck - MicroSolutions hard-drive pd bpck - MicroSolutions 8000t tape pt bpck - SyQuest EZ, SparQ pd epat - Imation Superdisk pf epat - Maxell Superdisk pf friq - Avatar Shark pd epat - FreeCom CD-ROM pcd frpw - Hewlett-Packard 5GB Tape pt epat - Hewlett-Packard 7200e (CD) pcd epat - Hewlett-Packard 7200e (CD-R) pg epat - ================ ============ ====== ======== - -2.1 Configuring built-in drivers ---------------------------------- - -We recommend that you get to know how the drivers work and how to -configure them as loadable modules, before attempting to compile a -kernel with the drivers built-in. - -If you built all of your PARIDE support directly into your kernel, -and you have just a single parallel port IDE device, your kernel should -locate it automatically for you. If you have more than one device, -you may need to give some command line options to your bootloader -(eg: LILO), how to do that is beyond the scope of this document. - -The high-level drivers accept a number of command line parameters, all -of which are documented in the source files in linux/drivers/block/paride. -By default, each driver will automatically try all parallel ports it -can find, and all protocol types that have been installed, until it finds -a parallel port IDE adapter. Once it finds one, the probe stops. So, -if you have more than one device, you will need to tell the drivers -how to identify them. This requires specifying the port address, the -protocol identification number and, for some devices, the drive's -chain ID. While your system is booting, a number of messages are -displayed on the console. Like all such messages, they can be -reviewed with the 'dmesg' command. Among those messages will be -some lines like:: - - paride: bpck registered as protocol 0 - paride: epat registered as protocol 1 - -The numbers will always be the same until you build a new kernel with -different protocol selections. You should note these numbers as you -will need them to identify the devices. - -If you happen to be using a MicroSolutions backpack device, you will -also need to know the unit ID number for each drive. This is usually -the last two digits of the drive's serial number (but read MicroSolutions' -documentation about this). - -As an example, let's assume that you have a MicroSolutions PD/CD drive -with unit ID number 36 connected to the parallel port at 0x378, a SyQuest -EZ-135 connected to the chained port on the PD/CD drive and also an -Imation Superdisk connected to port 0x278. You could give the following -options on your boot command:: - - pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 - -In the last option, pf.drive1 configures device /dev/pf1, the 0x378 -is the parallel port base address, the 0 is the protocol registration -number and 36 is the chain ID. - -Please note: while PARIDE will work both with and without the -PARPORT parallel port sharing system that is included by the -"Parallel port support" option, PARPORT must be included and enabled -if you want to use chains of devices on the same parallel port. - -2.2 Loading and configuring PARIDE as modules ----------------------------------------------- - -It is much faster and simpler to get to understand the PARIDE drivers -if you use them as loadable kernel modules. - -Note 1: - using these drivers with the "kerneld" automatic module loading - system is not recommended for beginners, and is not documented here. - -Note 2: - if you build PARPORT support as a loadable module, PARIDE must - also be built as loadable modules, and PARPORT must be loaded before - the PARIDE modules. - -To use PARIDE, you must begin by:: - - insmod paride - -this loads a base module which provides a registry for the protocols, -among other tasks. - -Then, load as many of the protocol modules as you think you might need. -As you load each module, it will register the protocols that it supports, -and print a log message to your kernel log file and your console. For -example:: - - # insmod epat - paride: epat registered as protocol 0 - # insmod kbic - paride: k951 registered as protocol 1 - paride: k971 registered as protocol 2 - -Finally, you can load high-level drivers for each kind of device that -you have connected. By default, each driver will autoprobe for a single -device, but you can support up to four similar devices by giving their -individual co-ordinates when you load the driver. - -For example, if you had two no-name CD-ROM drives both using the -KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc -you could give the following command:: - - # insmod pcd drive0=0x378,1 drive1=0x3bc,1 - -For most adapters, giving a port address and protocol number is sufficient, -but check the source files in linux/drivers/block/paride for more -information. (Hopefully someone will write some man pages one day !). - -As another example, here's what happens when PARPORT is installed, and -a SyQuest EZ-135 is attached to port 0x378:: - - # insmod paride - paride: version 1.0 installed - # insmod epat - paride: epat registered as protocol 0 - # insmod pd - pd: pd version 1.0, major 45, cluster 64, nice 0 - pda: Sharing parport1 at 0x378 - pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 - pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media - pda: pda1 - -Note that the last line is the output from the generic partition table -scanner - in this case it reports that it has found a disk with one partition. - -2.3 Using a PARIDE device --------------------------- - -Once the drivers have been loaded, you can access PARIDE devices in the -same way as their traditional counterparts. You will probably need to -create the device "special files". Here is a simple script that you can -cut to a file and execute:: - - #!/bin/bash - # - # mkd -- a script to create the device special files for the PARIDE subsystem - # - function mkdev { - mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 - } - # - function pd { - D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) - mkdev pd$D b 45 $[ $1 * 16 ] - for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] - done - } - # - cd /dev - # - for u in 0 1 2 3 ; do pd $u ; done - for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done - for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done - for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done - for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done - for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done - # - # end of mkd - -With the device files and drivers in place, you can access PARIDE devices -like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: - - mount /dev/pcd0 /cdrom - -If you have a fresh Avatar Shark cartridge, and the drive is pda, you -might do something like:: - - fdisk /dev/pda -- make a new partition table with - partition 1 of type 83 - - mke2fs /dev/pda1 -- to build the file system - - mkdir /shark -- make a place to mount the disk - - mount /dev/pda1 /shark - -Devices like the Imation superdisk work in the same way, except that -they do not have a partition table. For example to make a 120MB -floppy that you could share with a DOS system:: - - mkdosfs /dev/pf0 - mount /dev/pf0 /mnt - - -2.4 The pf driver ------------------- - -The pf driver is intended for use with parallel port ATAPI disk -devices. The most common devices in this category are PD drives -and LS-120 drives. Traditionally, media for these devices are not -partitioned. Consequently, the pf driver does not support partitioned -media. This may be changed in a future version of the driver. - -2.5 Using the pt driver ------------------------- - -The pt driver for parallel port ATAPI tape drives is a minimal driver. -It does not yet support many of the standard tape ioctl operations. -For best performance, a block size of 32KB should be used. You will -probably want to set the parallel port delay to 0, if you can. - -2.6 Using the pg driver ------------------------- - -The pg driver can be used in conjunction with the cdrecord program -to create CD-ROMs. Please get cdrecord version 1.6.1 or later -from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media -your parallel port should ideally be set to EPP mode, and the "port delay" -should be set to 0. With those settings it is possible to record at 2x -speed without any buffer underruns. If you cannot get the driver to work -in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. - - -3. Troubleshooting -================== - -3.1 Use EPP mode if you can ----------------------------- - -The most common problems that people report with the PARIDE drivers -concern the parallel port CMOS settings. At this time, none of the -PARIDE protocol modules support ECP mode, or any ECP combination modes. -If you are able to do so, please set your parallel port into EPP mode -using your CMOS setup procedure. - -3.2 Check the port delay -------------------------- - -Some parallel ports cannot reliably transfer data at full speed. To -offset the errors, the PARIDE protocol modules introduce a "port -delay" between each access to the i/o ports. Each protocol sets -a default value for this delay. In most cases, the user can override -the default and set it to 0 - resulting in somewhat higher transfer -rates. In some rare cases (especially with older 486 systems) the -default delays are not long enough. if you experience corrupt data -transfers, or unexpected failures, you may wish to increase the -port delay. The delay can be programmed using the "driveN" parameters -to each of the high-level drivers. Please see the notes above, or -read the comments at the beginning of the driver source files in -linux/drivers/block/paride. - -3.3 Some drives need a printer reset -------------------------------------- - -There appear to be a number of "noname" external drives on the market -that do not always power up correctly. We have noticed this with some -drives based on OnSpec and older Freecom adapters. In these rare cases, -the adapter can often be reinitialised by issuing a "printer reset" on -the parallel port. As the reset operation is potentially disruptive in -multiple device environments, the PARIDE drivers will not do it -automatically. You can however, force a printer reset by doing:: - - insmod lp reset=1 - rmmod lp - -If you have one of these marginal cases, you should probably build -your paride drivers as modules, and arrange to do the printer reset -before loading the PARIDE drivers. - -3.4 Use the verbose option and dmesg if you need help ------------------------------------------------------- - -While a lot of testing has gone into these drivers to make them work -as smoothly as possible, problems will arise. If you do have problems, -please check all the obvious things first: does the drive work in -DOS with the manufacturer's drivers ? If that doesn't yield any useful -clues, then please make sure that only one drive is hooked to your system, -and that either (a) PARPORT is enabled or (b) no other device driver -is using your parallel port (check in /proc/ioports). Then, load the -appropriate drivers (you can load several protocol modules if you want) -as in:: - - # insmod paride - # insmod epat - # insmod bpck - # insmod kbic - ... - # insmod pd verbose=1 - -(using the correct driver for the type of device you have, of course). -The verbose=1 parameter will cause the drivers to log a trace of their -activity as they attempt to locate your drive. - -Use 'dmesg' to capture a log of all the PARIDE messages (any messages -beginning with paride:, a protocol module's name or a driver's name) and -include that with your bug report. You can submit a bug report in one -of two ways. Either send it directly to the author of the PARIDE suite, -by e-mail to grant@torque.net, or join the linux-parport mailing list -and post your report there. - -3.5 For more information or help ---------------------------------- - -You can join the linux-parport mailing list by sending a mail message -to: - - linux-parport-request@torque.net - -with the single word:: - - subscribe - -in the body of the mail message (not in the subject line). Please be -sure that your mail program is correctly set up when you do this, as -the list manager is a robot that will subscribe you using the reply -address in your mail headers. REMOVE any anti-spam gimmicks you may -have in your mail headers, when sending mail to the list server. - -You might also find some useful information on the linux-parport -web pages (although they are not always up to date) at - - http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/blockdev/ramdisk.rst b/Documentation/blockdev/ramdisk.rst deleted file mode 100644 index b7c2268f8dec..000000000000 --- a/Documentation/blockdev/ramdisk.rst +++ /dev/null @@ -1,177 +0,0 @@ -========================================== -Using the RAM disk block device with Linux -========================================== - -.. Contents: - - 1) Overview - 2) Kernel Command Line Parameters - 3) Using "rdev -r" - 4) An Example of Creating a Compressed RAM Disk - - -1) Overview ------------ - -The RAM disk driver is a way to use main system memory as a block device. It -is required for initrd, an initial filesystem used if you need to load modules -in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can -also be used for a temporary filesystem for crypto work, since the contents -are erased on reboot. - -The RAM disk dynamically grows as more space is required. It does this by using -RAM from the buffer cache. The driver marks the buffers it is using as dirty -so that the VM subsystem does not try to reclaim them later. - -The RAM disk supports up to 16 RAM disks by default, and can be reconfigured -to support an unlimited number of RAM disks (at your own risk). Just change -the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu -and (re)build the kernel. - -To use RAM disk support with your system, run './MAKEDEV ram' from the /dev -directory. RAM disks are all major number 1, and start with minor number 0 -for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. - -The new RAM disk also has the ability to load compressed RAM disk images, -allowing one to squeeze more programs onto an average installation or -rescue floppy disk. - - -2) Parameters ---------------------------------- - -2a) Kernel Command Line Parameters - - ramdisk_size=N - Size of the ramdisk. - -This parameter tells the RAM disk driver to set up RAM disks of N k size. The -default is 4096 (4 MB). - -2b) Module parameters - - rd_nr - /dev/ramX devices created. - - max_part - Maximum partition number. - - rd_size - See ramdisk_size. - -3) Using "rdev -r" ------------------- - -The usage of the word (two bytes) that "rdev -r" sets in the kernel image is -as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up -to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit -14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a -prompt/wait sequence is to be given before trying to read the RAM disk. Since -the RAM disk dynamically grows as data is being written into it, a size field -is not required. Bits 11 to 13 are not currently used and may as well be zero. -These numbers are no magical secrets, as seen below:: - - ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF - ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 - ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 - -Consider a typical two floppy disk setup, where you will have the -kernel on disk one, and have already put a RAM disk image onto disk #2. - -Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk -starts at an offset of 0 kB from the beginning of the floppy. -The command line equivalent is: "ramdisk_start=0" - -You want bit 14 as one, indicating that a RAM disk is to be loaded. -The command line equivalent is: "load_ramdisk=1" - -You want bit 15 as one, indicating that you want a prompt/keypress -sequence so that you have a chance to switch floppy disks. -The command line equivalent is: "prompt_ramdisk=1" - -Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. -So to create disk one of the set, you would do:: - - /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 - /usr/src/linux# rdev /dev/fd0 /dev/fd0 - /usr/src/linux# rdev -r /dev/fd0 49152 - -If you make a boot disk that has LILO, then for the above, you would use:: - - append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" - -Since the default start = 0 and the default prompt = 1, you could use:: - - append = "load_ramdisk=1" - - -4) An Example of Creating a Compressed RAM Disk ------------------------------------------------ - -To create a RAM disk image, you will need a spare block device to -construct it on. This can be the RAM disk device itself, or an -unused disk partition (such as an unmounted swap partition). For this -example, we will use the RAM disk device, "/dev/ram0". - -Note: This technique should not be done on a machine with less than 8 MB -of RAM. If using a spare disk partition instead of /dev/ram0, then this -restriction does not apply. - -a) Decide on the RAM disk size that you want. Say 2 MB for this example. - Create it by writing to the RAM disk device. (This step is not currently - required, but may be in the future.) It is wise to zero out the - area (esp. for disks) so that maximal compression is achieved for - the unused blocks of the image that you are about to create:: - - dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 - -b) Make a filesystem on it. Say ext2fs for this example:: - - mke2fs -vm0 /dev/ram0 2048 - -c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) - and unmount it again. - -d) Compress the contents of the RAM disk. The level of compression - will be approximately 50% of the space used by the files. Unused - space on the RAM disk will compress to almost nothing:: - - dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz - -e) Put the kernel onto the floppy:: - - dd if=zImage of=/dev/fd0 bs=1k - -f) Put the RAM disk image onto the floppy, after the kernel. Use an offset - that is slightly larger than the kernel, so that you can put another - (possibly larger) kernel onto the same floppy later without overlapping - the RAM disk image. An offset of 400 kB for kernels about 350 kB in - size would be reasonable. Make sure offset+size of ram_image.gz is - not larger than the total space on your floppy (usually 1440 kB):: - - dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 - -g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. - For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would - have 2^15 + 2^14 + 400 = 49552:: - - rdev /dev/fd0 /dev/fd0 - rdev -r /dev/fd0 49552 - -That is it. You now have your boot/root compressed RAM disk floppy. Some -users may wish to combine steps (d) and (f) by using a pipe. - - - Paul Gortmaker 12/95 - -Changelog: ----------- - -10-22-04 : - Updated to reflect changes in command line options, remove - obsolete references, general cleanup. - James Nelson (james4765@gmail.com) - - -12-95 : - Original Document diff --git a/Documentation/blockdev/zram.rst b/Documentation/blockdev/zram.rst deleted file mode 100644 index 6eccf13219ff..000000000000 --- a/Documentation/blockdev/zram.rst +++ /dev/null @@ -1,422 +0,0 @@ -======================================== -zram: Compressed RAM based block devices -======================================== - -Introduction -============ - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -Usage -===== - -There are several ways to configure and manage zram device(-s): - -a) using zram and zram_control sysfs attributes -b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). - -In this document we will describe only 'manual' zram configuration steps, -IOW, zram and zram_control sysfs attributes. - -In order to get a better idea about zramctl please consult util-linux -documentation, zramctl man-page or `zramctl --help`. Please be informed -that zram maintainers do not develop/maintain util-linux or zramctl, should -you have any questions please contact util-linux@vger.kernel.org - -Following shows a typical sequence of steps for using zram. - -WARNING -======= - -For the sake of simplicity we skip error checking parts in most of the -examples below. However, it is your sole responsibility to handle errors. - -zram sysfs attributes always return negative values in case of errors. -The list of possible return codes: - -======== ============================================================= --EBUSY an attempt to modify an attribute that cannot be changed once - the device has been initialised. Please reset device first; --ENOMEM zram was not able to allocate enough memory to fulfil your - needs; --EINVAL invalid input has been provided. -======== ============================================================= - -If you use 'echo', the returned value that is changed by 'echo' utility, -and, in general case, something like:: - - echo 3 > /sys/block/zram0/max_comp_streams - if [ $? -ne 0 ]; - handle_error - fi - -should suffice. - -1) Load Module -============== - -:: - - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - -num_devices parameter is optional and tells zram how many devices should be -pre-created. Default: 1. - -2) Set max number of compression streams -======================================== - -Regardless the value passed to this attribute, ZRAM will always -allocate multiple compression streams - one per online CPUs - thus -allowing several concurrent compression operations. The number of -allocated compression streams goes down when some of the CPUs -become offline. There is no single-compression-stream mode anymore, -unless you are running a UP system or has only 1 CPU online. - -To find out how many streams are currently available:: - - cat /sys/block/zram0/max_comp_streams - -3) Select compression algorithm -=============================== - -Using comp_algorithm device attribute one can see available and -currently selected (shown in square brackets) compression algorithms, -change selected compression algorithm (once the device is initialised -there is no way to change compression algorithm). - -Examples:: - - #show supported compression algorithms - cat /sys/block/zram0/comp_algorithm - lzo [lz4] - - #select lzo compression algorithm - echo lzo > /sys/block/zram0/comp_algorithm - -For the time being, the `comp_algorithm` content does not necessarily -show every compression algorithm supported by the kernel. We keep this -list primarily to simplify device configuration and one can configure -a new device with a compression algorithm that is not listed in -`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API -and, if some of the algorithms were built as modules, it's impossible -to list all of them using, for instance, /proc/crypto or any other -method. This, however, has an advantage of permitting the usage of -custom crypto compression modules (implementing S/W or H/W compression). - -4) Set Disksize -=============== - -Set disk size by writing the value to sysfs node 'disksize'. -The value can be either in bytes or you can use mem suffixes. -Examples:: - - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -Note: -There is little point creating a zram of greater than twice the size of memory -since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the -size of the disk when not in use so a huge zram is wasteful. - -5) Set memory limit: Optional -============================= - -Set memory limit by writing the value to sysfs node 'mem_limit'. -The value can be either in bytes or you can use mem suffixes. -In addition, you could change the value in runtime. -Examples:: - - # limit /dev/zram0 with 50MB memory - echo $((50*1024*1024)) > /sys/block/zram0/mem_limit - - # Using mem suffixes - echo 256K > /sys/block/zram0/mem_limit - echo 512M > /sys/block/zram0/mem_limit - echo 1G > /sys/block/zram0/mem_limit - - # To disable memory limit - echo 0 > /sys/block/zram0/mem_limit - -6) Activate -=========== - -:: - - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -7) Add/remove zram devices -========================== - -zram provides a control interface, which enables dynamic (on-demand) device -addition and removal. - -In order to add a new /dev/zramX device, perform read operation on hot_add -attribute. This will return either new device's device id (meaning that you -can use /dev/zram) or error code. - -Example:: - - cat /sys/class/zram-control/hot_add - 1 - -To remove the existing /dev/zramX device (where X is a device id) -execute:: - - echo X > /sys/class/zram-control/hot_remove - -8) Stats -======== - -Per-device statistics are exported as various nodes under /sys/block/zram/ - -A brief description of exported device attributes. For more details please -read Documentation/ABI/testing/sysfs-block-zram. - -====================== ====== =============================================== -Name access description -====================== ====== =============================================== -disksize RW show and set the device's disk size -initstate RO shows the initialization state of the device -reset WO trigger device reset -mem_used_max WO reset the `mem_used_max` counter (see later) -mem_limit WO specifies the maximum amount of memory ZRAM can - use to store the compressed data -writeback_limit WO specifies the maximum amount of write IO zram - can write out to backing device as 4KB unit -writeback_limit_enable RW show and set writeback_limit feature -max_comp_streams RW the number of possible concurrent compress - operations -comp_algorithm RW show and change the compression algorithm -compact WO trigger memory compaction -debug_stat RO this file is used for zram debugging purposes -backing_dev RW set up backend storage for zram to write out -idle WO mark allocated slot as idle -====================== ====== =============================================== - - -User space is advised to use the following files to read the device statistics. - -File /sys/block/zram/stat - -Represents block layer statistics. Read Documentation/block/stat.rst for -details. - -File /sys/block/zram/io_stat - -The stat file represents device's I/O statistics not accounted by block -layer and, thus, not available in zram/stat file. It consists of a -single line of text and contains the following stats separated by -whitespace: - - ============= ============================================================= - failed_reads The number of failed reads - failed_writes The number of failed writes - invalid_io The number of non-page-size-aligned I/O requests - notify_free Depending on device usage scenario it may account - - a) the number of pages freed because of swap slot free - notifications - b) the number of pages freed because of - REQ_OP_DISCARD requests sent by bio. The former ones are - sent to a swap block device when a swap slot is freed, - which implies that this disk is being used as a swap disk. - - The latter ones are sent by filesystem mounted with - discard option, whenever some data blocks are getting - discarded. - ============= ============================================================= - -File /sys/block/zram/mm_stat - -The stat file represents device's mm statistics. It consists of a single -line of text and contains the following stats separated by whitespace: - - ================ ============================================================= - orig_data_size uncompressed size of data stored in this disk. - This excludes same-element-filled pages (same_pages) since - no memory is allocated for them. - Unit: bytes - compr_data_size compressed size of data stored in this disk - mem_used_total the amount of memory allocated for this disk. This - includes allocator fragmentation and metadata overhead, - allocated for this disk. So, allocator space efficiency - can be calculated using compr_data_size and this statistic. - Unit: bytes - mem_limit the maximum amount of memory ZRAM can use to store - the compressed data - mem_used_max the maximum amount of memory zram have consumed to - store the data - same_pages the number of same element filled pages written to this disk. - No memory is allocated for such pages. - pages_compacted the number of pages freed during compaction - huge_pages the number of incompressible pages - ================ ============================================================= - -File /sys/block/zram/bd_stat - -The stat file represents device's backing device statistics. It consists of -a single line of text and contains the following stats separated by whitespace: - - ============== ============================================================= - bd_count size of data written in backing device. - Unit: 4K bytes - bd_reads the number of reads from backing device - Unit: 4K bytes - bd_writes the number of writes to backing device - Unit: 4K bytes - ============== ============================================================= - -9) Deactivate -============= - -:: - - swapoff /dev/zram0 - umount /dev/zram1 - -10) Reset -========= - - Write any positive value to 'reset' sysfs node:: - - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -Optional Feature -================ - -writeback ---------- - -With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page -to backing storage rather than keeping it in memory. -To use the feature, admin should set up backing device via:: - - echo /dev/sda5 > /sys/block/zramX/backing_dev - -before disksize setting. It supports only partition at this moment. -If admin want to use incompressible page writeback, they could do via:: - - echo huge > /sys/block/zramX/write - -To use idle page writeback, first, user need to declare zram pages -as idle:: - - echo all > /sys/block/zramX/idle - -From now on, any pages on zram are idle pages. The idle mark -will be removed until someone request access of the block. -IOW, unless there is access request, those pages are still idle pages. - -Admin can request writeback of those idle pages at right timing via:: - - echo idle > /sys/block/zramX/writeback - -With the command, zram writeback idle pages from memory to the storage. - -If there are lots of write IO with flash device, potentially, it has -flash wearout problem so that admin needs to design write limitation -to guarantee storage health for entire product life. - -To overcome the concern, zram supports "writeback_limit" feature. -The "writeback_limit_enable"'s default value is 0 so that it doesn't limit -any writeback. IOW, if admin want to apply writeback budget, he should -enable writeback_limit_enable via:: - - $ echo 1 > /sys/block/zramX/writeback_limit_enable - -Once writeback_limit_enable is set, zram doesn't allow any writeback -until admin set the budget via /sys/block/zramX/writeback_limit. - -(If admin doesn't enable writeback_limit_enable, writeback_limit's value -assigned via /sys/block/zramX/writeback_limit is meaninless.) - -If admin want to limit writeback as per-day 400M, he could do it -like below:: - - $ MB_SHIFT=20 - $ 4K_SHIFT=12 - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit. - $ echo 1 > /sys/block/zram0/writeback_limit_enable - -If admin want to allow further write again once the bugdet is exausted, -he could do it like below:: - - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit - -If admin want to see remaining writeback budget since he set:: - - $ cat /sys/block/zramX/writeback_limit - -If admin want to disable writeback limit, he could do:: - - $ echo 0 > /sys/block/zramX/writeback_limit_enable - -The writeback_limit count will reset whenever you reset zram(e.g., -system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of -writeback happened until you reset the zram to allocate extra writeback -budget in next setting is user's job. - -If admin want to measure writeback count in a certain period, he could -know it via /sys/block/zram0/bd_stat's 3rd column. - -memory tracking -=============== - -With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the -zram block. It could be useful to catch cold or incompressible -pages of the process with*pagemap. - -If you enable the feature, you could see block state via -/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: - - 300 75.033841 .wh. - 301 63.806904 s... - 302 63.806919 ..hi - -First column - zram's block index. -Second column - access time since the system was booted -Third column - state of the block: - - s: - same page - w: - written page to backing store - h: - huge page - i: - idle page - -First line of above example says 300th block is accessed at 75.033841sec -and the block's state is huge so it is written back to the backing -storage. It's a debugging feature so anyone shouldn't rely on it to work -properly. - -Nitin Gupta -ngupta@vflare.org diff --git a/MAINTAINERS b/MAINTAINERS index b36028f43192..699596d931c1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5006,7 +5006,7 @@ T: git git://git.linbit.com/drbd-8.4.git S: Supported F: drivers/block/drbd/ F: lib/lru_cache.c -F: Documentation/blockdev/drbd/ +F: Documentation/admin-guide/blockdev/ DRIVER CORE, KOBJECTS, DEBUGFS AND SYSFS M: Greg Kroah-Hartman @@ -11076,7 +11076,7 @@ M: Josef Bacik S: Maintained L: linux-block@vger.kernel.org L: nbd@other.debian.org -F: Documentation/blockdev/nbd.rst +F: Documentation/admin-guide/blockdev/nbd.rst F: drivers/block/nbd.c F: include/trace/events/nbd.h F: include/uapi/linux/nbd.h @@ -12086,7 +12086,7 @@ PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES M: Tim Waugh L: linux-parport@lists.infradead.org (subscribers-only) S: Maintained -F: Documentation/blockdev/paride.rst +F: Documentation/admin-guide/blockdev/paride.rst F: drivers/block/paride/ PARISC ARCHITECTURE @@ -13367,7 +13367,7 @@ F: drivers/net/wireless/ralink/rt2x00/ RAMDISK RAM BLOCK DEVICE DRIVER M: Jens Axboe S: Maintained -F: Documentation/blockdev/ramdisk.rst +F: Documentation/admin-guide/blockdev/ramdisk.rst F: drivers/block/brd.c RANCHU VIRTUAL BOARD FOR MIPS @@ -17723,7 +17723,7 @@ R: Sergey Senozhatsky L: linux-kernel@vger.kernel.org S: Maintained F: drivers/block/zram/ -F: Documentation/blockdev/zram.rst +F: Documentation/admin-guide/blockdev/zram.rst ZS DECSTATION Z85C30 SERIAL DRIVER M: "Maciej W. Rozycki" diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index c43690b973d8..1bb8ec575352 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -31,7 +31,7 @@ config BLK_DEV_FD If you want to use the floppy disk drive(s) of your PC under Linux, say Y. Information about this driver, especially important for IBM Thinkpad users, is contained in - . + . That file also contains the location of the Floppy driver FAQ as well as location of the fdutils package used to configure additional parameters of the driver at run time. @@ -96,7 +96,7 @@ config PARIDE your computer's parallel port. Most of them are actually IDE devices using a parallel port IDE adapter. This option enables the PARIDE subsystem which contains drivers for many of these external drives. - Read for more information. + Read for more information. If you have said Y to the "Parallel-port support" configuration option, you may share a single port between your printer and other @@ -261,7 +261,7 @@ config BLK_DEV_NBD userland (making server and client physically the same computer, communicating using the loopback network device). - Read for more information, + Read for more information, especially about where to find the server code, which runs in user space and does not need special kernel support. @@ -303,7 +303,7 @@ config BLK_DEV_RAM during the initial install of Linux. Note that the kernel command line option "ramdisk=XX" is now obsolete. - For details, read . + For details, read . To compile this driver as a module, choose M here: the module will be called brd. An alias "rd" has been defined diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 5c99e52f9dc1..f652c1ac3ae9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4424,7 +4424,7 @@ static int __init floppy_setup(char *str) pr_cont("\n"); } else DPRINT("botched floppy option\n"); - DPRINT("Read Documentation/blockdev/floppy.rst\n"); + DPRINT("Read Documentation/admin-guide/blockdev/floppy.rst\n"); return 0; } diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index e06b99d54816..fe7a4b7d30cf 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -12,7 +12,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See Documentation/blockdev/zram.rst for more information. + See Documentation/admin-guide/blockdev/zram.rst for more information. config ZRAM_WRITEBACK bool "Write back incompressible or idle page to backing device" @@ -26,7 +26,7 @@ config ZRAM_WRITEBACK With /sys/block/zramX/{idle,writeback}, application could ask idle page's writeback to the backing device to save in memory. - See Documentation/blockdev/zram.rst for more information. + See Documentation/admin-guide/blockdev/zram.rst for more information. config ZRAM_MEMORY_TRACKING bool "Track zRam block status" @@ -36,4 +36,4 @@ config ZRAM_MEMORY_TRACKING of zRAM. Admin could see the information via /sys/kernel/debug/zram/zramX/block_state. - See Documentation/blockdev/zram.rst for more information. + See Documentation/admin-guide/blockdev/zram.rst for more information. diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README index 5fa378391d3b..110b34834a6f 100644 --- a/tools/testing/selftests/zram/README +++ b/tools/testing/selftests/zram/README @@ -37,4 +37,4 @@ Commands required for testing: - mkfs/ mkfs.ext4 For more information please refer: -kernel-source-tree/Documentation/blockdev/zram.rst +kernel-source-tree/Documentation/admin-guide/blockdev/zram.rst -- cgit v1.2.3 From da82c92f1150f66afabf78d2c85ef9ac18dc6d38 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 13:08:35 -0300 Subject: docs: cgroup-v1: add it to the admin-guide book Those files belong to the admin guide, so add them. Signed-off-by: Mauro Carvalho Chehab --- .../admin-guide/cgroup-v1/blkio-controller.rst | 302 ++++++ Documentation/admin-guide/cgroup-v1/cgroups.rst | 695 ++++++++++++++ Documentation/admin-guide/cgroup-v1/cpuacct.rst | 50 + Documentation/admin-guide/cgroup-v1/cpusets.rst | 866 +++++++++++++++++ Documentation/admin-guide/cgroup-v1/devices.rst | 132 +++ .../admin-guide/cgroup-v1/freezer-subsystem.rst | 127 +++ Documentation/admin-guide/cgroup-v1/hugetlb.rst | 50 + Documentation/admin-guide/cgroup-v1/index.rst | 28 + Documentation/admin-guide/cgroup-v1/memcg_test.rst | 355 +++++++ Documentation/admin-guide/cgroup-v1/memory.rst | 1003 ++++++++++++++++++++ Documentation/admin-guide/cgroup-v1/net_cls.rst | 44 + Documentation/admin-guide/cgroup-v1/net_prio.rst | 57 ++ Documentation/admin-guide/cgroup-v1/pids.rst | 92 ++ Documentation/admin-guide/cgroup-v1/rdma.rst | 117 +++ Documentation/admin-guide/cgroup-v2.rst | 2 +- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 4 +- .../admin-guide/mm/numa_memory_policy.rst | 2 +- Documentation/block/bfq-iosched.rst | 2 +- Documentation/cgroup-v1/blkio-controller.rst | 302 ------ Documentation/cgroup-v1/cgroups.rst | 695 -------------- Documentation/cgroup-v1/cpuacct.rst | 50 - Documentation/cgroup-v1/cpusets.rst | 866 ----------------- Documentation/cgroup-v1/devices.rst | 132 --- Documentation/cgroup-v1/freezer-subsystem.rst | 127 --- Documentation/cgroup-v1/hugetlb.rst | 50 - Documentation/cgroup-v1/index.rst | 30 - Documentation/cgroup-v1/memcg_test.rst | 355 ------- Documentation/cgroup-v1/memory.rst | 1003 -------------------- Documentation/cgroup-v1/net_cls.rst | 44 - Documentation/cgroup-v1/net_prio.rst | 57 -- Documentation/cgroup-v1/pids.rst | 92 -- Documentation/cgroup-v1/rdma.rst | 117 --- Documentation/filesystems/tmpfs.txt | 2 +- Documentation/kernel-per-CPU-kthreads.txt | 2 +- Documentation/scheduler/sched-deadline.rst | 2 +- Documentation/scheduler/sched-design-CFS.rst | 2 +- Documentation/scheduler/sched-rt-group.rst | 2 +- Documentation/vm/numa.rst | 4 +- Documentation/vm/page_migration.rst | 2 +- Documentation/vm/unevictable-lru.rst | 2 +- Documentation/x86/x86_64/fake-numa-for-cpusets.rst | 4 +- MAINTAINERS | 4 +- block/Kconfig | 2 +- include/linux/cgroup-defs.h | 2 +- include/uapi/linux/bpf.h | 2 +- init/Kconfig | 4 +- kernel/cgroup/cpuset.c | 2 +- security/device_cgroup.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 50 files changed, 3945 insertions(+), 3946 deletions(-) create mode 100644 Documentation/admin-guide/cgroup-v1/blkio-controller.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cgroups.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cpuacct.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cpusets.rst create mode 100644 Documentation/admin-guide/cgroup-v1/devices.rst create mode 100644 Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst create mode 100644 Documentation/admin-guide/cgroup-v1/hugetlb.rst create mode 100644 Documentation/admin-guide/cgroup-v1/index.rst create mode 100644 Documentation/admin-guide/cgroup-v1/memcg_test.rst create mode 100644 Documentation/admin-guide/cgroup-v1/memory.rst create mode 100644 Documentation/admin-guide/cgroup-v1/net_cls.rst create mode 100644 Documentation/admin-guide/cgroup-v1/net_prio.rst create mode 100644 Documentation/admin-guide/cgroup-v1/pids.rst create mode 100644 Documentation/admin-guide/cgroup-v1/rdma.rst delete mode 100644 Documentation/cgroup-v1/blkio-controller.rst delete mode 100644 Documentation/cgroup-v1/cgroups.rst delete mode 100644 Documentation/cgroup-v1/cpuacct.rst delete mode 100644 Documentation/cgroup-v1/cpusets.rst delete mode 100644 Documentation/cgroup-v1/devices.rst delete mode 100644 Documentation/cgroup-v1/freezer-subsystem.rst delete mode 100644 Documentation/cgroup-v1/hugetlb.rst delete mode 100644 Documentation/cgroup-v1/index.rst delete mode 100644 Documentation/cgroup-v1/memcg_test.rst delete mode 100644 Documentation/cgroup-v1/memory.rst delete mode 100644 Documentation/cgroup-v1/net_cls.rst delete mode 100644 Documentation/cgroup-v1/net_prio.rst delete mode 100644 Documentation/cgroup-v1/pids.rst delete mode 100644 Documentation/cgroup-v1/rdma.rst (limited to 'tools') diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst new file mode 100644 index 000000000000..1d7d962933be --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst @@ -0,0 +1,302 @@ +=================== +Block IO Controller +=================== + +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kinds of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in a storage hierarchy. +Plan is to use the same cgroup based management interface for blkio controller +and based on user options switch IO policies in the background. + +One IO control policy is throttling policy which can be used to +specify upper IO rate limits on devices. This policy is implemented in +generic block layer and can be used on leaf nodes as well as higher +level logical devices like device mapper. + +HOWTO +===== +Throttling/Upper Limit policy +----------------------------- +- Enable Block IO controller:: + + CONFIG_BLK_CGROUP=y + +- Enable throttling in block layer:: + + CONFIG_BLK_DEV_THROTTLING=y + +- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: + + mount -t cgroup -o blkio none /sys/fs/cgroup/blkio + +- Specify a bandwidth rate on particular device for root group. The format + for policy is ": ":: + + echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device + + Above will put a limit of 1MB/second on reads happening for root group + on device having major/minor number 8:16. + +- Run dd to read a file and see if rate is throttled to 1MB/s or not:: + + # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 + 1024+0 records in + 1024+0 records out + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s + + Limits for writes can be put using blkio.throttle.write_bps_device file. + +Hierarchical Cgroups +==================== + +Throttling implements hierarchy support; however, +throttling's hierarchy support is enabled iff "sane_behavior" is +enabled from cgroup side, which currently is a development option and +not publicly available. + +If somebody created a hierarchy like as follows:: + + root + / \ + test1 test2 + | + test3 + +Throttling with "sane_behavior" will handle the +hierarchy correctly. For throttling, all limits apply +to the whole subtree while all statistics are local to the IOs +directly generated by tasks in that cgroup. + +Throttling without "sane_behavior" enabled from cgroup side will +practically treat all groups at same level as if it looks like the +following:: + + pivot + / / \ \ + root test1 test2 test3 + +Various user visible config options +=================================== +CONFIG_BLK_CGROUP + - Block IO controller. + +CONFIG_BFQ_CGROUP_DEBUG + - Debug help. Right now some additional stats file show up in cgroup + if this option is enabled. + +CONFIG_BLK_DEV_THROTTLING + - Enable block device throttling support in block layer. + +Details of cgroup files +======================= +Proportional weight policy files +-------------------------------- +- blkio.weight + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See blkio.weight_device). + Currently allowed range of weights is from 10 to 1000. + +- blkio.weight_device + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight as specified + by blkio.weight. + + Following is the format:: + + # echo dev_maj:dev_minor weight > blkio.weight_device + + Configure weight=300 on /dev/sdb (8:16) in this cgroup:: + + # echo 8:16 300 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + + Configure weight=500 on /dev/sda (8:0) in this cgroup:: + + # echo 8:0 500 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:0 500 + 8:16 300 + + Remove specific weight for /dev/sda in this cgroup:: + + # echo 8:0 0 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + +- blkio.leaf_weight[_device] + - Equivalents of blkio.weight[_device] for the purpose of + deciding how much weight tasks in the given cgroup has while + competing with the cgroup's child cgroups. For details, + please refer to Documentation/block/cfq-iosched.txt. + +- blkio.time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- blkio.sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- blkio.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +- blkio.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.io_service_time + - Total amount of time between request dispatch and request completion + for the IOs done by this cgroup. This is in nanoseconds to make it + meaningful for flash devices too. For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. + +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. + +- blkio.idle_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time spent by the IO scheduler idling for a + given cgroup in anticipation of a better request than the existing ones + from other queues/cgroups. This is in nanoseconds. If this is read + when the cgroup is in an idling state, the stat will only report the + idle_time accumulated till the last idle period and will not include + the current delta. + +- blkio.dequeue + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This + gives the statistics about how many a times a group was dequeued + from service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was dequeued from a particular device. + +- blkio.*_recursive + - Recursive version of various stats. These files show the + same information as their non-recursive counterparts but + include stats from all the descendant cgroups. + +Throttling/Upper limit policy files +----------------------------------- +- blkio.throttle.read_bps_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.read_bps_device + +- blkio.throttle.write_bps_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.write_bps_device + +- blkio.throttle.read_iops_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in IO per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.read_iops_device + +- blkio.throttle.write_iops_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in io per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.write_iops_device + +Note: If both BW and IOPS rules are specified for a device, then IO is + subjected to both the constraints. + +- blkio.throttle.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.throttle.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +Common files among various policies +----------------------------------- +- blkio.reset_stats + - Writing an int to this file will result in resetting all the stats + for that cgroup. diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst new file mode 100644 index 000000000000..b0688011ed06 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst @@ -0,0 +1,695 @@ +============== +Control Groups +============== + +Written by Paul Menage based on +Documentation/admin-guide/cgroup-v1/cpusets.rst + +Original copyright statements from cpusets.txt: + +Portions Copyright (C) 2004 BULL SA. + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. + +Modified by Paul Jackson + +Modified by Christoph Lameter + +.. CONTENTS: + + 1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 What does clone_children do ? + 1.6 How do I use cgroups ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes + 2.3 Mounting hierarchies by name + 3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API + 4. Extended attributes usage + 5. Questions + +1. Control Groups +================= + +1.1 What are cgroups ? +---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierarchies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User-level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task PIDs assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. + +On their own, the only use for cgroups is for simple job +tracking. The intention is that other subsystems hook into the generic +cgroup support to provide new attributes for cgroups, such as +accounting/limiting the resources which processes in a cgroup can +access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rst) allow +you to associate a set of CPUs and a set of memory nodes with the +tasks in each cgroup. + +1.2 Why are cgroups needed ? +---------------------------- + +There are multiple efforts to provide process aggregations in the +Linux kernel, mainly for resource-tracking purposes. Such efforts +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server +namespaces. These all require the basic notion of a +grouping/partitioning of processes, with newly forked processes ending +up in the same group (cgroup) as their parent process. + +The kernel cgroup patch provides the minimum essential kernel +mechanisms required to efficiently implement such groups. It has +minimal impact on the system fast paths, and provides hooks for +specific subsystems such as cpusets to provide additional behaviour as +desired. + +Multiple hierarchy support is provided to allow for situations where +the division of tasks into cgroups is distinctly different for +different subsystems - having parallel hierarchies allows each +hierarchy to be a natural division of tasks, without having to handle +complex combinations of tasks that would be present if several +unrelated subsystems needed to be forced into the same tree of +cgroups. + +At one extreme, each resource controller or subsystem could be in a +separate hierarchy; at the other extreme, all subsystems +would be attached to the same hierarchy. + +As an example of a scenario (originally proposed by vatsa@in.ibm.com) +that can benefit from multiple hierarchies, consider a large +university server with various users - students, professors, system +tasks etc. The resource planning for this server could be along the +following lines:: + + CPU : "Top cpuset" + / \ + CPUSet1 CPUSet2 + | | + (Professors) (Students) + + In addition (system tasks) are attached to topcpuset (so + that they can run anywhere) with a limit of 20% + + Memory : Professors (50%), Students (30%), system (20%) + + Disk : Professors (50%), Students (30%), system (20%) + + Network : WWW browsing (20%), Network File System (60%), others (20%) + / \ + Professors (15%) students (5%) + +Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes +into the NFS network class. + +At the same time Firefox/Lynx will share an appropriate CPU/Memory class +depending on who launched it (prof/student). + +With the ability to classify tasks differently for different resources +(by putting those resource subsystems in different hierarchies), +the admin can easily set up a script which receives exec notifications +and depending on who is launching the browser he can:: + + # echo browser_pid > /sys/fs/cgroup///tasks + +With only a single hierarchy, he now would potentially have to create +a separate cgroup for every browser launched and associate it with +appropriate network and other resource class. This may lead to +proliferation of such cgroups. + +Also let's say that the administrator would like to give enhanced network +access temporarily to a student's browser (since it is night and the user +wants to do online gaming :)) OR give one of the student's simulation +apps enhanced CPU power. + +With ability to write PIDs directly to resource classes, it's just a +matter of:: + + # echo pid > /sys/fs/cgroup/network//tasks + (after some time) + # echo pid > /sys/fs/cgroup/network//tasks + +Without this ability, the administrator would have to split the cgroup into +multiple separate ones and then associate the new cgroups with the +new resource classes. + + + +1.3 How are cgroups implemented ? +--------------------------------- + +Control Groups extends the kernel as follows: + + - Each task in the system has a reference-counted pointer to a + css_set. + + - A css_set contains a set of reference-counted pointers to + cgroup_subsys_state objects, one for each cgroup subsystem + registered in the system. There is no direct link from a task to + the cgroup of which it's a member in each hierarchy, but this + can be determined by following pointers through the + cgroup_subsys_state objects. This is because accessing the + subsystem state is something that's expected to happen frequently + and in performance-critical code, whereas operations that require a + task's actual cgroup assignments (in particular, moving between + cgroups) are less common. A linked list runs through the cg_list + field of each task_struct using the css_set, anchored at + css_set->tasks. + + - A cgroup hierarchy filesystem can be mounted for browsing and + manipulation from user space. + + - You can list all the tasks (by PID) attached to any cgroup. + +The implementation of cgroups requires a few, simple hooks +into the rest of the kernel, none in performance-critical paths: + + - in init/main.c, to initialize the root cgroups and initial + css_set at system boot. + + - in fork and exit, to attach and detach a task from its css_set. + +In addition, a new file system of type "cgroup" may be mounted, to +enable browsing and modifying the cgroups presently known to the +kernel. When mounting a cgroup hierarchy, you may specify a +comma-separated list of subsystems to mount as the filesystem mount +options. By default, mounting the cgroup filesystem attempts to +mount a hierarchy containing all registered subsystems. + +If an active hierarchy with exactly the same set of subsystems already +exists, it will be reused for the new mount. If no existing hierarchy +matches, and any of the requested subsystems are in use in an existing +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy +is activated, associated with the requested subsystems. + +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. + +When a cgroup filesystem is unmounted, if there are any +child cgroups created below the top-level cgroup, that hierarchy +will remain active even though unmounted; if there are no +child cgroups then the hierarchy will be deactivated. + +No new system calls are added for cgroups - all support for +querying and modifying cgroups is via this cgroup file system. + +Each task under /proc has an added file named 'cgroup' displaying, +for each active hierarchy, the subsystem names and the cgroup name +as the path relative to the root of the cgroup file system. + +Each cgroup is represented by a directory in the cgroup file system +containing the following files describing that cgroup: + + - tasks: list of tasks (by PID) attached to that cgroup. This list + is not guaranteed to be sorted. Writing a thread ID into this file + moves the thread into this cgroup. + - cgroup.procs: list of thread group IDs in the cgroup. This list is + not guaranteed to be sorted or free of duplicate TGIDs, and userspace + should sort/uniquify the list if this property is required. + Writing a thread group ID into this file moves all threads in that + group into this cgroup. + - notify_on_release flag: run the release agent on exit? + - release_agent: the path to use for release notifications (this file + exists in the top cgroup only) + +Other subsystems such as cpusets may add additional files in each +cgroup dir. + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. + +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, otherwise a new +css_set is allocated. The appropriate existing css_set is located by +looking into a hash table. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cgrp_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents' notify_on_release settings. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 What does clone_children do ? +--------------------------------- + +This flag only affects the cpuset controller. If the clone_children +flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its +configuration from the parent during initialization. + +1.6 How do I use cgroups ? +-------------------------- + +To start a new job that is to be contained within a cgroup, using +the "cpuset" cgroup subsystem, the steps are something like:: + + 1) mount -t tmpfs cgroup_root /sys/fs/cgroup + 2) mkdir /sys/fs/cgroup/cpuset + 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 4) Create the new cgroup by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 5) Start a task that will be the "founding father" of the new job. + 6) Attach that task to the new cgroup by writing its PID to the + /sys/fs/cgroup/cpuset tasks file for that cgroup. + 7) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cgroup +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cgroup:: + + mount -t tmpfs cgroup_root /sys/fs/cgroup + mkdir /sys/fs/cgroup/cpuset + mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cgroup Charlie + # The next line should display '/Charlie' + cat /proc/self/cgroup + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using cgroups can be done through the cgroup +virtual filesystem. + +To mount a cgroup hierarchy with all available subsystems, type:: + + # mount -t cgroup xxx /sys/fs/cgroup + +The "xxx" is not interpreted by the cgroup code, but will appear in +/proc/mounts so may be any useful identifying string that you like. + +Note: Some subsystems do not work without some user input first. For instance, +if cpusets are enabled the user will have to populate the cpus and mems files +for each new cgroup created before that group can be used. + +As explained in section `1.2 Why are cgroups needed?` you should create +different hierarchies of cgroups for each single resource or group of +resources you want to control. Therefore, you should mount a tmpfs on +/sys/fs/cgroup and create directories for each cgroup resource or resource +group:: + + # mount -t tmpfs cgroup_root /sys/fs/cgroup + # mkdir /sys/fs/cgroup/rg1 + +To mount a cgroup hierarchy with just the cpuset and memory +subsystems, type:: + + # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 + +While remounting cgroups is currently supported, it is not recommend +to use it. Remounting allows changing bound subsystems and +release_agent. Rebinding is hardly useful as it only works when the +hierarchy is empty and release_agent itself should be replaced with +conventional fsnotify. The support for remounting will be removed in +the future. + +To Specify a hierarchy's release_agent:: + + # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ + xxx /sys/fs/cgroup/rg1 + +Note that specifying 'release_agent' more than once will return failure. + +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup. Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. + +Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the +tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 +is the cgroup that holds the whole system. + +If you want to change the value of release_agent:: + + # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent + +It can also be changed via remount. + +If you want to create a new cgroup under /sys/fs/cgroup/rg1:: + + # cd /sys/fs/cgroup/rg1 + # mkdir my_cgroup + +Now you want to do something with this cgroup: + + # cd my_cgroup + +In this directory you can find several files:: + + # ls + cgroup.procs notify_on_release tasks + (plus whatever files added by the attached subsystems) + +Now attach your shell to this cgroup:: + + # /bin/echo $$ > tasks + +You can also create cgroups inside your cgroup by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cgroup, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cgroup is in use (has cgroups inside, or +has processes attached, or is held alive by other subsystem-specific +reference). + +2.2 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + +You can attach the current shell task by echoing 0:: + + # echo 0 > tasks + +You can use the cgroup.procs file instead of the tasks file to move all +threads in a threadgroup at once. Echoing the PID of any task in a +threadgroup to cgroup.procs causes all tasks in that threadgroup to be +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +in the writing task's threadgroup. + +Note: Since every task is always a member of exactly one cgroup in each +mounted hierarchy, to remove a task from its current cgroup you must +move it into a new cgroup (possibly the root cgroup) by writing to the +new cgroup's tasks file. + +Note: Due to some restrictions enforced by some cgroup subsystems, moving +a process to another cgroup can fail. + +2.3 Mounting hierarchies by name +-------------------------------- + +Passing the name= option when mounting a cgroups hierarchy +associates the given name with the hierarchy. This can be used when +mounting a pre-existing hierarchy, in order to refer to it by name +rather than by its set of active subsystems. Each hierarchy is either +nameless, or has a unique name. + +The name should match [\w.-]+ + +When passing a name= option for a new hierarchy, you need to +specify subsystems manually; the legacy behaviour of mounting all +subsystems when none are explicitly specified is not supported when +you give a subsystem a name. + +The name of the subsystem appears as part of the hierarchy description +in /proc/mounts and /proc//cgroups. + + +3. Kernel API +============= + +3.1 Overview +------------ + +Each kernel subsystem that wants to hook into the generic cgroup +system needs to create a cgroup_subsys object. This contains +various methods, which are callbacks from the cgroup system, along +with a subsystem ID which will be assigned by the cgroup system. + +Other fields in the cgroup_subsys object include: + +- subsys_id: a unique array index for the subsystem, indicating which + entry in cgroup->subsys[] this subsystem should be managing. + +- name: should be initialized to a unique subsystem name. Should be + no longer than MAX_CGROUP_TYPE_NAMELEN. + +- early_init: indicate if the subsystem needs early initialization + at system boot. + +Each cgroup object created by the system has an array of pointers, +indexed by subsystem ID; this pointer is entirely managed by the +subsystem; the generic cgroup code will never touch this pointer. + +3.2 Synchronization +------------------- + +There is a global mutex, cgroup_mutex, used by the cgroup +system. This should be taken by anything that wants to modify a +cgroup. It may also be taken to prevent cgroups from being +modified, but more specific locks may be more appropriate in that +situation. + +See kernel/cgroup.c for more details. + +Subsystems can take/release the cgroup_mutex via the functions +cgroup_lock()/cgroup_unlock(). + +Accessing a task's cgroup pointer may be done in the following ways: +- while holding cgroup_mutex +- while holding the task's alloc_lock (via task_lock()) +- inside an rcu_read_lock() section via rcu_dereference() + +3.3 Subsystem API +----------------- + +Each subsystem should: + +- add an entry in linux/cgroup_subsys.h +- define a cgroup_subsys object called _cgrp_subsys + +Each subsystem may export the following methods. The only mandatory +methods are css_alloc/free. Any others that are null are presumed to +be successful no-ops. + +``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called to allocate a subsystem state object for a cgroup. The +subsystem should allocate its subsystem state object for the passed +cgroup, returning a pointer to the new object on success or a +ERR_PTR() value. On success, the subsystem pointer should point to +a structure of type cgroup_subsys_state (typically embedded in a +larger subsystem-specific object), which will be initialized by the +cgroup system. Note that this will be called at initialization to +create the root subsystem state for this subsystem; this case can be +identified by the passed cgroup object having a NULL parent (since +it's the root of the hierarchy) and may be an appropriate place for +initialization code. + +``int css_online(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called after @cgrp successfully completed all allocations and made +visible to cgroup_for_each_child/descendant_*() iterators. The +subsystem may choose to fail creation by returning -errno. This +callback can be used to implement reliable state sharing and +propagation along the hierarchy. See the comment on +cgroup_for_each_descendant_pre() for details. + +``void css_offline(struct cgroup *cgrp);`` +(cgroup_mutex held by caller) + +This is the counterpart of css_online() and called iff css_online() +has succeeded on @cgrp. This signifies the beginning of the end of +@cgrp. @cgrp is being removed and the subsystem should start dropping +all references it's holding on @cgrp. When all references are dropped, +cgroup removal will proceed to the next step - css_free(). After this +callback, @cgrp should be considered dead to the subsystem. + +``void css_free(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +The cgroup system is about to free @cgrp; the subsystem should free +its subsystem state object. By the time this method is called, @cgrp +is completely unused; @cgrp->parent is still valid. (Note - can also +be called for a newly-created cgroup if an error occurs after this +subsystem's create() method has been called for the new cgroup). + +``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called prior to moving one or more tasks into a cgroup; if the +subsystem returns an error, this will abort the attach operation. +@tset contains the tasks to be attached and is guaranteed to have at +least one task in it. + +If there are multiple tasks in the taskset, then: + - it's guaranteed that all are from the same thread group + - @tset contains all tasks from the thread group whether or not + they're switching cgroups + - the first task is the leader + +Each @tset entry also contains the task's old cgroup and tasks which +aren't switching cgroup can be skipped easily using the +cgroup_taskset_for_each() iterator. Note that this isn't called on a +fork. If this method returns 0 (success) then this should remain valid +while the caller holds cgroup_mutex and it is ensured that either +attach() or cancel_attach() will be called in future. + +``void css_reset(struct cgroup_subsys_state *css)`` +(cgroup_mutex held by caller) + +An optional operation which should restore @css's configuration to the +initial state. This is currently only used on the unified hierarchy +when a subsystem is disabled on a cgroup through +"cgroup.subtree_control" but should remain enabled because other +subsystems depend on it. cgroup core makes such a css invisible by +removing the associated interface files and invokes this callback so +that the hidden subsystem can return to the initial neutral state. +This prevents unexpected resource control from a hidden css and +ensures that the configuration is in the initial state when it is made +visible again later. + +``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called when a task attach operation has failed after can_attach() has succeeded. +A subsystem whose can_attach() has some side-effects should provide this +function, so that the subsystem can implement a rollback. If not, not necessary. +This will be called only about subsystems whose can_attach() operation have +succeeded. The parameters are identical to can_attach(). + +``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called after the task has been attached to the cgroup, to allow any +post-attachment activity that requires memory allocations or blocking. +The parameters are identical to can_attach(). + +``void fork(struct task_struct *task)`` + +Called when a task is forked into a cgroup. + +``void exit(struct task_struct *task)`` + +Called during task exit. + +``void free(struct task_struct *task)`` + +Called when the task_struct is freed. + +``void bind(struct cgroup *root)`` +(cgroup_mutex held by caller) + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). + +4. Extended attribute usage +=========================== + +cgroup filesystem supports certain types of extended attributes in its +directories and files. The current supported types are: + + - Trusted (XATTR_TRUSTED) + - Security (XATTR_SECURITY) + +Both require CAP_SYS_ADMIN capability to set. + +Like in tmpfs, the extended attributes in cgroup filesystem are stored +using kernel memory and it's advised to keep the usage at minimum. This +is the reason why user defined extended attributes are not supported, since +any user can do it and there's no limit in the value size. + +The current known users for this feature are SELinux to limit cgroup usage +in containers and systemd for assorted meta data like main PID in a cgroup +(systemd creates a cgroup per service). + +5. Questions +============ + +:: + + Q: what's up with this '/bin/echo' ? + A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cgroup file system, you won't be + able to tell whether a command succeeded or failed. + + Q: When I attach processes, only the first of the line gets really attached ! + A: We can only return one error code per call to write(). So you should also + put only ONE PID. diff --git a/Documentation/admin-guide/cgroup-v1/cpuacct.rst b/Documentation/admin-guide/cgroup-v1/cpuacct.rst new file mode 100644 index 000000000000..d30ed81d2ad7 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cpuacct.rst @@ -0,0 +1,50 @@ +========================= +CPU Accounting Controller +========================= + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -ocpuacct none /sys/fs/cgroup + +With the above step, the initial or the parent accounting group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. +/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained +by this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/sys/fs/cgroup/cpuacct.usage also. + +cpuacct.stat file lists a few statistics which further divide the +CPU time obtained by the cgroup into user and system times. Currently +the following statistics are supported: + +user: Time spent by tasks of the cgroup in user mode. +system: Time spent by tasks of the cgroup in kernel mode. + +user and system are in USER_HZ unit. + +cpuacct controller uses percpu_counter interface to collect user and +system times. This has two side effects: + +- It is theoretically possible to see wrong values for user and system times. + This is because percpu_counter_read() on 32bit systems isn't safe + against concurrent writes. +- It is possible to see slightly outdated values for user and system times + due to the batch processing nature of percpu_counter. diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst new file mode 100644 index 000000000000..86a6ae995d54 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -0,0 +1,866 @@ +======= +CPUSETS +======= + +Copyright (C) 2004 BULL SA. + +Written by Simon.Derr@bull.net + +- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +- Modified by Paul Jackson +- Modified by Christoph Lameter +- Modified by Paul Menage +- Modified by Hidetoshi Seto + +.. CONTENTS: + + 1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes + 3. Questions + 4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a task's current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/admin-guide/cgroup-v1/cgroups.rst. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that task's cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting task's mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that task's cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that task's cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. + - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendants) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. + +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that task's cpuset. + - in sched.c migrate_live_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that task's cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc//status file for each task has four added lines, +displaying the task's cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example:: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpuset.cpus: list of CPUs in that cpuset + - cpuset.mems: list of Memory Nodes in that cpuset + - cpuset.memory_migrate flag: if set, move pages to cpusets nodes + - cpuset.cpu_exclusive flag: is cpu placement exclusive? + - cpuset.mem_exclusive flag: is memory placement exclusive? + - cpuset.mem_hardwall flag: is memory allocation hardwalled + - cpuset.memory_pressure: measure of how much paging pressure in cpuset + - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes + - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes + - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset + - cpuset.sched_relax_domain_level: the searching range when migrating tasks + +In addition, only the root cpuset has the following file: + + - cpuset.memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. + +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps a +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_mask using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendant, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned to them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> + Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'cpuset.memory_spread_page' and +'cpuset.memory_spread_slab'. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the task's NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the task's NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing task's memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag +PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PFA_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'cpuset.memory_spread_slab' turns on the flag +PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current task's mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched/core.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. + +This default load balancing across all CPUs is not well suited for +the following two situations: + + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"cpuset.sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendant cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside its cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "cpuset.sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of struct cpumask) of CPUs, pairwise disjoint, that cover +all the CPUs that must be load balanced. + +The cpuset code builds a new such partition and passes it to the +scheduler sched domain setup code, to have the sched domains rebuilt +as necessary, whenever: + + - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, + - or CPUs come or go from a cpuset with this flag enabled, + - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs + and with this flag enabled changes, + - or a cpuset with non-empty CPUs and with this flag enabled is removed, + - or a cpu is offlined/onlined. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (struct cpumask) in the +partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +In sched domain, the scheduler migrates tasks in 2 ways; periodic load +balance on tick, and at time of some schedule events. + +When a task is woken up, scheduler try to move the task on idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and performing idle, +then scheduler migrate task B to CPU Y so that task B can start on +CPU Y without waiting task A on CPU X. + +And if a CPU run out of tasks in its runqueue, the CPU try to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +every time. In fact, in some architectures, the searching ranges on +events are limited in the same socket or node where the CPU locates, +while the load balance on tick searches all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As the result, task B on CPU X need to wait task A or wait load balance +on the next tick. For some applications in special situation, waiting +1 tick may be too long. + +The 'cpuset.sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes int value which +indicates size of searching range in levels ideally as follows, +otherwise initial value -1 that indicates the cpuset has no request. + +====== =========================================================== + -1 no request. use system default or follow request of others. + 0 no search. + 1 search siblings (hyperthreads in a core). + 2 search cores in a package. + 3 search cpus in a node [= system wide on non-NUMA system] + 4 search nodes in a chunk of node [on NUMA system] + 5 search system wide [on NUMA system] +====== =========================================================== + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affect the sched domain where the cpuset +belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset +is disabled, then 'cpuset.sched_relax_domain_level' have no effect since +there is no sched domain belonging the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not depends on your situation. +Don't modify this file if you are not sure. + +If your situation is: + + - The migration costs between each cpu can be assumed considerably + small(for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact(for you) or you can make + the searching cost enough small by managing cpuset to compact etc. + - The latency is required even it sacrifices cache hit rate etc. + then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the task's cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its NUMA placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the task's +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a task's pid is written to another cpuset's 'tasks' file, then its +allowed CPU placement is changed immediately. If such a task had been +bound to some subset of its cpuset using the sched_setaffinity() call, +the task will be allowed to run on any CPU allowed in its new cpuset, +negating the effect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +and the processor placement is updated immediately. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'cpuset.mems' subsequently changes. +If the cpuset flag file 'cpuset.memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the task's new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'cpuset.memory_migrate' is set true, then if that cpuset's +'cpuset.mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'cpuset.mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the task's prior cpuset, or in the cpuset's +prior 'cpuset.mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current task's cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /sys/fs/cgroup/cpuset + 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /sys/fs/cgroup/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset:: + + mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +There are ways to query or modify cpusets: + + - via the cpuset file system directly, using the various cd, mkdir, echo, + cat, rmdir commands from the shell, or their equivalent from C. + - via the C library libcpuset. + - via the C library libcgroup. + (http://sourceforge.net/projects/libcg/) + - via the python application cset. + (http://code.google.com/p/cpuset/) + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset + +Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /sys/fs/cgroup/cpuset:: + + # cd /sys/fs/cgroup/cpuset + # mkdir my_cpuset + +Now you want to do something with this cpuset:: + + # cd my_cpuset + +In this directory you can find several files:: + + # ls + cgroup.clone_children cpuset.memory_pressure + cgroup.event_control cpuset.memory_spread_page + cgroup.procs cpuset.memory_spread_slab + cpuset.cpu_exclusive cpuset.mems + cpuset.cpus cpuset.sched_load_balance + cpuset.mem_exclusive cpuset.sched_relax_domain_level + cpuset.mem_hardwall notify_on_release + cpuset.memory_migrate tasks + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags:: + + # /bin/echo 1 > cpuset.cpu_exclusive + +Add some cpus:: + + # /bin/echo 0-7 > cpuset.cpus + +Add some mems:: + + # /bin/echo 0-7 > cpuset.mems + +Now attach your shell to this cpuset:: + + # /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cpuset, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command:: + + mount -t cpuset X /sys/fs/cgroup/cpuset + +is equivalent to:: + + mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset + echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories:: + + # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + +To add a CPU to a cpuset, write the new list of CPUs including the +CPU to be added. To add 6 to the above cpuset:: + + # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 + +Similarly to remove a CPU from a cpuset, write the new list of CPUs +without the CPU to be removed. + +To remove all the CPUs:: + + # /bin/echo "" > cpuset.cpus -> clear cpus list + +2.3 Setting flags +----------------- + +The syntax is very simple:: + + # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' + # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' + +2.4 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: + what's up with this '/bin/echo' ? + +A: + bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: + When I attach processes, only the first of the line gets really attached ! + +A: + We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/admin-guide/cgroup-v1/devices.rst b/Documentation/admin-guide/cgroup-v1/devices.rst new file mode 100644 index 000000000000..e1886783961e --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/devices.rst @@ -0,0 +1,132 @@ +=========================== +Device Whitelist Controller +=========================== + +1. Description +============== + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. + +2. User Interface +================= + +An entry is added using devices.allow, and removed using +devices.deny. For instance:: + + echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing:: + + echo a > /sys/fs/cgroup/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing:: + + echo a > /sys/fs/cgroup/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security +=========== + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendant of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. + +4. Hierarchy +============ + +device cgroups maintain hierarchy by making sure a cgroup never has more +access permissions than its parent. Every time an entry is written to +a cgroup's devices.deny file, all its children will have that entry removed +from their whitelist and all the locally set whitelist entries will be +re-evaluated. In case one of the locally set whitelist entries would provide +more access than the cgroup's parent, it'll be removed from the whitelist. + +Example:: + + A + / \ + B + + group behavior exceptions + A allow "b 8:* rwm", "c 116:1 rw" + B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" + +If a device is denied in group A:: + + # echo "c 116:* r" > A/devices.deny + +it'll propagate down and after revalidating B's entries, the whitelist entry +"c 116:2 rwm" will be removed:: + + group whitelist entries denied devices + A all "b 8:* rwm", "c 116:* rw" + B "c 1:3 rwm", "b 3:* rwm" all the rest + +In case parent's exceptions change and local exceptions are not allowed +anymore, they'll be deleted. + +Notice that new whitelist entries will not be propagated:: + + A + / \ + B + + group whitelist entries denied devices + A "c 1:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +when adding ``c *:3 rwm``:: + + # echo "c *:3 rwm" >A/devices.allow + +the result:: + + group whitelist entries denied devices + A "c *:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +but now it'll be possible to add new entries to B:: + + # echo "c 2:3 rwm" >B/devices.allow + # echo "c 50:3 r" >B/devices.allow + +or even:: + + # echo "c *:3 rwm" >B/devices.allow + +Allowing or denying all by writing 'a' to devices.allow or devices.deny will +not be possible once the device cgroups has children. + +4.1 Hierarchy (internal implementation) +--------------------------------------- + +device cgroups is implemented internally using a behavior (ALLOW, DENY) and a +list of exceptions. The internal state is controlled using the same user +interface to preserve compatibility with the previous whitelist-only +implementation. Removal or addition of exceptions that will reduce the access +to devices will be propagated down the hierarchy. +For every propagated exception, the effective rules will be re-evaluated based +on current parent's access rules. diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst new file mode 100644 index 000000000000..582d3427de3f --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst @@ -0,0 +1,127 @@ +============== +Cgroup Freezer +============== + +The cgroup freezer is useful to batch job management system which start +and stop sets of tasks in order to schedule the resources of a machine +according to the desires of a system administrator. This sort of program +is often used on HPC clusters to schedule access to the cluster as a +whole. The cgroup freezer uses cgroups to describe the set of tasks to +be started/stopped by the batch job management system. It also provides +a means to start and stop the tasks composing the job. + +The cgroup freezer will also be useful for checkpointing running groups +of tasks. The freezer allows the checkpoint code to obtain a consistent +image of the tasks by attempting to force the tasks in a cgroup into a +quiescent state. Once the tasks are quiescent another task can +walk /proc or invoke a kernel interface to gather information about the +quiesced tasks. Checkpointed tasks can be restarted later should a +recoverable error occur. This also allows the checkpointed tasks to be +migrated between nodes in a cluster by copying the gathered information +to another node and restarting the tasks there. + +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +and resuming tasks in userspace. Both of these signals are observable +from within the tasks we wish to freeze. While SIGSTOP cannot be caught, +blocked, or ignored it can be seen by waiting or ptracing parent tasks. +SIGCONT is especially unsuitable since it can be caught by the task. Any +programs designed to watch for SIGSTOP and SIGCONT could be broken by +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can +demonstrate this problem using nested bash shells:: + + $ echo $$ + 16644 + $ bash + $ echo $$ + 16690 + + From a second, unrelated bash shell: + $ kill -SIGSTOP 16690 + $ kill -SIGCONT 16690 + + + +This happens because bash can observe both signals and choose how it +responds to them. + +Another example of a program which catches and responds to these +signals is gdb. In fact any program designed to use ptrace is likely to +have a problem with this method of stopping and resuming tasks. + +In contrast, the cgroup freezer uses the kernel freezer code to +prevent the freeze/unfreeze cycle from becoming visible to the tasks +being frozen. This allows the bash example above and gdb to run as +expected. + +The cgroup freezer is hierarchical. Freezing a cgroup freezes all +tasks belonging to the cgroup and all its descendant cgroups. Each +cgroup has its own state (self-state) and the state inherited from the +parent (parent-state). Iff both states are THAWED, the cgroup is +THAWED. + +The following cgroupfs files are created by cgroup freezer. + +* freezer.state: Read-write. + + When read, returns the effective state of the cgroup - "THAWED", + "FREEZING" or "FROZEN". This is the combined self and parent-states. + If any is freezing, the cgroup is freezing (FREEZING or FROZEN). + + FREEZING cgroup transitions into FROZEN state when all tasks + belonging to the cgroup and its descendants become frozen. Note that + a cgroup reverts to FREEZING from FROZEN after a new task is added + to the cgroup or one of its descendant cgroups until the new task is + frozen. + + When written, sets the self-state of the cgroup. Two values are + allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, + if not already freezing, enters FREEZING state along with all its + descendant cgroups. + + If THAWED is written, the self-state of the cgroup is changed to + THAWED. Note that the effective state may not change to THAWED if + the parent-state is still freezing. If a cgroup's effective state + becomes THAWED, all its descendants which are freezing because of + the cgroup also leave the freezing state. + +* freezer.self_freezing: Read only. + + Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. + This value is 1 iff the last write to freezer.state was "FROZEN". + +* freezer.parent_freezing: Read only. + + Shows the parent-state. 0 if none of the cgroup's ancestors is + frozen; otherwise, 1. + +The root cgroup is non-freezable and the above interface files don't +exist. + +* Examples of usage:: + + # mkdir /sys/fs/cgroup/freezer + # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer + # mkdir /sys/fs/cgroup/freezer/0 + # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks + +to get status of the freezer subsystem:: + + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +to freeze all tasks in the container:: + + # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + FREEZING + # cat /sys/fs/cgroup/freezer/0/freezer.state + FROZEN + +to unfreeze all tasks in the container:: + + # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +This is the basic mechanism which should do the right thing for user space task +in a simple scenario. diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst new file mode 100644 index 000000000000..a3902aa253a9 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -0,0 +1,50 @@ +================== +HugeTLB Controller +================== + +The HugeTLB controller allows to limit the HugeTLB usage per control group and +enforces the controller limit during page fault. Since HugeTLB doesn't +support page reclaim, enforcing the limit at page fault time implies that, +the application will get SIGBUS signal if it tries to access HugeTLB pages +beyond its limit. This requires the application to know beforehand how much +HugeTLB pages it would require for its use. + +HugeTLB controller can be created by first mounting the cgroup filesystem. + +# mount -t cgroup -o hugetlb none /sys/fs/cgroup + +With the above step, the initial or the parent HugeTLB group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. + +New groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. + +Brief summary of control files:: + + hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage + hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded + hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb + hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit + +For a system supporting three hugepage sizes (64k, 32M and 1G), the control +files include:: + + hugetlb.1GB.limit_in_bytes + hugetlb.1GB.max_usage_in_bytes + hugetlb.1GB.usage_in_bytes + hugetlb.1GB.failcnt + hugetlb.64KB.limit_in_bytes + hugetlb.64KB.max_usage_in_bytes + hugetlb.64KB.usage_in_bytes + hugetlb.64KB.failcnt + hugetlb.32MB.limit_in_bytes + hugetlb.32MB.max_usage_in_bytes + hugetlb.32MB.usage_in_bytes + hugetlb.32MB.failcnt diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst new file mode 100644 index 000000000000..10bf48bae0b0 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/index.rst @@ -0,0 +1,28 @@ +======================== +Control Groups version 1 +======================== + +.. toctree:: + :maxdepth: 1 + + cgroups + + blkio-controller + cpuacct + cpusets + devices + freezer-subsystem + hugetlb + memcg_test + memory + net_cls + net_prio + pids + rdma + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst new file mode 100644 index 000000000000..3f7115e07b5d --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst @@ -0,0 +1,355 @@ +===================================================== +Memory Resource Controller(Memcg) Implementation Memo +===================================================== + +Last Updated: 2010/2 + +Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/admin-guide/cgroup-v1/memory.rst) + +0. How to record usage ? +======================== + + 2 objects are used. + + page_cgroup ....an object per page. + + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge +========= + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_try_charge() + +2. Uncharge +=========== + + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge() + Called when a page's refcount goes down to 0. + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + +3. charge-commit-cancel +======================= + + Memcg pages are charged in two steps: + + - mem_cgroup_try_charge() + - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the page is associated with the memcg. + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous +============ + + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + +5. Page Cache +============= + + Page Cache is charged at + - add_to_page_cache_locked(). + + The logic is very clear. (About migration, see below) + + Note: + __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache +=========================== + + The best way to understand shmem's page state transition is to read + mm/shmem.c. + + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... + + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + +7. Page Migration +================= + + mem_cgroup_migrate() + +8. LRU +====== + Each memcg has its own private LRU. Now, its handling is under global + VM's control (means that it's handled under global pgdat->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under pgdat->lru_lock. + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. +================= + + Tests for racy cases. + +9.1 Small limit to memcg. +------------------------- + + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + +9.2 Shmem +--------- + + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + +9.3 Migration +------------- + + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration:: + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. Following is a script to migrate all + under cpuset.:: + + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + +9.4 Memory hotplug +------------------ + + memory hotplug test is one of good test. + + to offline memory, do following:: + + # echo offline > /sys/devices/system/memory/memoryXXX/state + + (XXX is the place of memory) + + This is an easy way to test page migration, too. + +9.5 mkdir/rmdir +--------------- + + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following:: + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running:: + + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + +9.6 Mount with other subsystems +------------------------------- + + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example:: + + # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. + +9.7 swapoff +----------- + + Besides management of swap is one of complicated parts of memcg, + call path of swap-in at swapoff is not same as usual swap-in path.. + It's worth to be tested explicitly. + + For example, test like following is good: + + (Shell-A):: + + # mount -t cgroup none /cgroup -o memory + # mkdir /cgroup/test + # echo 40M > /cgroup/test/memory.limit_in_bytes + # echo 0 > /cgroup/test/tasks + + Run malloc(100M) program under this. You'll see 60M of swaps. + + (Shell-B):: + + # move all tasks in /cgroup/test to /cgroup + # /sbin/swapoff -a + # rmdir /cgroup/test + # kill malloc task. + + Of course, tmpfs v.s. swapoff test should be tested, too. + +9.8 OOM-Killer +-------------- + + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + + Case A) when you can swapoff:: + + #swapoff -a + #echo 50M > /memory.limit_in_bytes + + run 51M of malloc + + Case B) when you use mem+swap limitation:: + + #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + + run 51M of malloc + +9.9 Move charges at task migration +---------------------------------- + + Charges associated with a task can be moved along with task migration. + + (Shell-A):: + + #mkdir /cgroup/A + #echo $$ >/cgroup/A/tasks + + run some programs which uses some amount of memory in /cgroup/A. + + (Shell-B):: + + #mkdir /cgroup/B + #echo 1 >/cgroup/B/memory.move_charge_at_immigrate + #echo "pid of the program running in group A" >/cgroup/B/tasks + + You can see charges have been moved by reading ``*.usage_in_bytes`` or + memory.stat of both A and B. + + See 8.2 of Documentation/admin-guide/cgroup-v1/memory.rst to see what value should + be written to move_charge_at_immigrate. + +9.10 Memory thresholds +---------------------- + + Memory controller implements memory thresholds using cgroups notification + API. You can use tools/cgroup/cgroup_event_listener.c to test it. + + (Shell-A) Create cgroup and run event listener:: + + # mkdir /cgroup/A + # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M + + (Shell-B) Add task to cgroup and try to allocate and free memory:: + + # echo $$ >/cgroup/A/tasks + # a="$(dd if=/dev/zero bs=1M count=10)" + # a= + + You will see message from cgroup_event_listener every time you cross + the thresholds. + + Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. + + It's good idea to test root cgroup as well. diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst new file mode 100644 index 000000000000..41bdc038dad9 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -0,0 +1,1003 @@ +========================== +Memory Resource Controller +========================== + +NOTE: + This document is hopelessly outdated and it asks for a complete + rewrite. It still contains a useful information so we are keeping it + here but make sure to check the current code if you need a deeper + understanding. + +NOTE: + The Memory Resource Controller has generically been referred to as the + memory controller in this document. Do not confuse memory controller + used here with the memory controller that is used in hardware. + +(For editors) In this document: + When we mention a cgroup (cgroupfs's directory) with memory controller, + we call it "memory cgroup". When you see git-log and source code, you'll + see patch's title and function names tend to use "memcg". + In this document, we avoid using it. + +Benefits and Purpose of the memory controller +============================================= + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory-hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with a limited amount of memory; this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases; find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +Current Status: linux-2.6.34-mmotm(development version of 2010/April) + +Features: + + - accounting anonymous pages, file caches, swap caches usage and limiting them. + - pages are linked to per-memcg LRU exclusively, and there is no global LRU. + - optionally, memory+swap usage can be accounted and limited. + - hierarchical accounting + - soft limit + - moving (recharging) account at moving a task is selectable. + - usage threshold notifier + - memory pressure notifier + - oom-killer disable knob and oom-notifier + - Root cgroup has no limit controls. + + Kernel memory support is a work in progress, and the current version provides + basically functionality. (See Section 2.7) + +Brief summary of control files. + +==================================== ========================================== + tasks attach a task(thread) and show list of + threads + cgroup.procs show list of processes + cgroup.event_control an interface for event_fd() + memory.usage_in_bytes show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes show current usage for memory+Swap + (See 5.5 for details) + memory.limit_in_bytes set/show limit of memory usage + memory.memsw.limit_in_bytes set/show limit of memory+Swap usage + memory.failcnt show the number of memory usage hits limits + memory.memsw.failcnt show the number of memory+Swap hits limits + memory.max_usage_in_bytes show max memory usage recorded + memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded + memory.soft_limit_in_bytes set/show soft limit of memory usage + memory.stat show various statistics + memory.use_hierarchy set/show hierarchical account enabled + memory.force_empty trigger forced page reclaim + memory.pressure_level set memory pressure notifications + memory.swappiness set/show swappiness parameter of vmscan + (See sysctl's vm.swappiness) + memory.move_charge_at_immigrate set/show controls of moving charges + memory.oom_control set/show oom controls. + memory.numa_stat show the number of memory usage per numa + node + + memory.kmem.limit_in_bytes set/show hard limit for kernel memory + memory.kmem.usage_in_bytes show current kernel memory allocation + memory.kmem.failcnt show the number of kernel memory usage + hits limits + memory.kmem.max_usage_in_bytes show max kernel memory usage recorded + + memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory + memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation + memory.kmem.tcp.failcnt show the number of tcp buf memory usage + hits limits + memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded +==================================== ========================================== + +1. History +========== + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control +================= + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design +----------- + +The core of the design is a counter called the page_counter. The +page_counter tracks the current memory usage and limit of the group of +processes associated with the controller. Each cgroup has a memory controller +specific data structure (mem_cgroup) associated with it. + +2.2. Accounting +--------------- + +:: + + +--------------------+ + | mem_cgroup | + | (page_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge_common() is invoked to +set up the necessary data structures and check if the cgroup that is being +charged is over its limit. If it is, then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +updated. page_cgroup has its own LRU on cgroup. +(*) page_cgroup structure is allocated at boot/memory-hotplug time. + +2.2.1 Accounting details +------------------------ + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +Some pages which are never reclaimable and will not be on the LRU +are not accounted. We just account pages under usual VM management. + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +An RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. Even if RSS pages are fully +unmapped (by kswapd), they may exist as SwapCache in the system until they +are really freed. Such SwapCaches are also accounted. +A swapped-in page is not accounted until it's mapped. + +Note: The kernel does swapin-readahead and reads multiple swaps at once. +This means swapped-in pages may contain pages for other tasks than a task +causing page fault. So, we avoid accounting at swap-in I/O. + +At page migration, accounting information is kept. + +Note: we just account pages-on-LRU because our purpose is to control amount +of used pages; not-on-LRU pages tend to be out-of-control from VM view. + +2.3 Shared Page Accounting +-------------------------- + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + +Exception: If CONFIG_MEMCG_SWAP is not used. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + +2.4 Swap Extension (CONFIG_MEMCG_SWAP) +-------------------------------------- + +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +memsw means memory+swap. Usage of memory+swap is limited by +memsw.limit_in_bytes. + +Example: Assume a system with 4G of swap. A task which allocates 6G of memory +(by mistake) under 2G memory limitation will use all swap. +In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. +By using the memsw limit, you can avoid system OOM which can be caused by swap +shortage. + +**why 'memory+swap' rather than swap** + +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +memory+swap. In other words, when we want to limit the usage of swap without +affecting global LRU, memory+swap limit is better than just limiting swap from +an OS point of view. + +**What happens when a cgroup hits memory.memsw.limit_in_bytes** + +When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out +in this cgroup. Then, swap-out will not be done by cgroup routine and file +caches are dropped. But as mentioned above, global LRU can do swapout memory +from it for sanity of the system's memory management state. You can't forbid +it by cgroup. + +2.5 Reclaim +----------- + +Each cgroup maintains a per cgroup LRU which has the same structure as +global VM. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. (See 10. OOM Control below.) + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per-cgroup LRU +list. + +NOTE: + Reclaim does not work for the root cgroup, since we cannot set any + limits on the root cgroup. + +Note2: + When panic_on_oom is set to "2", the whole system will panic. + +When oom event notifier is registered, event will be delivered. +(See oom_control section) + +2.6 Locking +----------- + + lock_page_cgroup()/unlock_page_cgroup() should not be called under + the i_pages lock. + + Other lock order is following: + + PG_locked. + mm->page_table_lock + pgdat->lru_lock + lock_page_cgroup. + + In many cases, just lock_page_cgroup() is called. + + per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by + pgdat->lru_lock, it has no lock of its own. + +2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +----------------------------------------------- + +With the Kernel memory extension, the Memory Controller is able to limit +the amount of kernel memory used by the system. Kernel memory is fundamentally +different than user memory, since it can't be swapped out, which makes it +possible to DoS the system by consuming too much of this precious resource. + +Kernel memory accounting is enabled for all memory cgroups by default. But +it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel +at boot time. In this case, kernel memory will not be accounted at all. + +Kernel memory limits are not imposed for the root cgroup. Usage for the root +cgroup may or may not be accounted. The memory used is accumulated into +memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. +(currently only for tcp). + +The main "kmem" counter is fed into the main counter, so kmem charges will +also be visible from the user counter. + +Currently no soft limit is implemented for kernel memory. It is future work +to trigger slab reclaim when those limits are reached. + +2.7.1 Current Kernel Memory resources accounted +----------------------------------------------- + +stack pages: + every process consumes some stack pages. By accounting into + kernel memory, we prevent new processes from being created when the kernel + memory usage is too high. + +slab pages: + pages allocated by the SLAB or SLUB allocator are tracked. A copy + of each kmem_cache is created every time the cache is touched by the first time + from inside the memcg. The creation is done lazily, so some objects can still be + skipped while the cache is being created. All objects in a slab page should + belong to the same memcg. This only fails to hold when a task is migrated to a + different memcg during the page allocation by the cache. + +sockets memory pressure: + some sockets protocols have memory pressure + thresholds. The Memory Controller allows them to be controlled individually + per cgroup, instead of globally. + +tcp memory pressure: + sockets memory pressure for the tcp protocol. + +2.7.2 Common use cases +---------------------- + +Because the "kmem" counter is fed to the main user counter, kernel memory can +never be limited completely independently of user memory. Say "U" is the user +limit, and "K" the kernel limit. There are three possible ways limits can be +set: + +U != 0, K = unlimited: + This is the standard memcg limitation mechanism already present before kmem + accounting. Kernel memory is completely ignored. + +U != 0, K < U: + Kernel memory is a subset of the user memory. This setup is useful in + deployments where the total amount of memory per-cgroup is overcommited. + Overcommiting kernel memory limits is definitely not recommended, since the + box can still run out of non-reclaimable memory. + In this case, the admin could set up K so that the sum of all groups is + never greater than the total memory, and freely set U at the cost of his + QoS. + +WARNING: + In the current implementation, memory reclaim will NOT be + triggered for a cgroup when it hits K while staying below U, which makes + this setup impractical. + +U != 0, K >= U: + Since kmem charges will also be fed to the user counter and reclaim will be + triggered for the cgroup for both kinds of memory. This setup gives the + admin a unified view of memory, and it is also useful for people who just + want to track kernel memory usage. + +3. User Interface +================= + +3.0. Configuration +------------------ + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_MEMCG +c. Enable CONFIG_MEMCG_SWAP (to use swap extension) +d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) + +3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) +------------------------------------------------------------------- + +:: + + # mount -t tmpfs none /sys/fs/cgroup + # mkdir /sys/fs/cgroup/memory + # mount -t cgroup none /sys/fs/cgroup/memory -o memory + +3.2. Make the new group and move bash into it:: + + # mkdir /sys/fs/cgroup/memory/0 + # echo $$ > /sys/fs/cgroup/memory/0/tasks + +Since now we're in the 0 cgroup, we can alter the memory limit:: + + # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes + +NOTE: + We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, + mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, + Gibibytes.) + +NOTE: + We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. + +NOTE: + We cannot set limits on the root cgroup any more. + +:: + + # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes + 4194304 + +We can check the usage:: + + # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes + 1216512 + +A successful write to this file does not guarantee a successful setting of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel:: + + # echo 1 > memory.limit_in_bytes + # cat memory.limit_in_bytes + 4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing +========== + +For testing features and implementation, see memcg_test.txt. + +Performance test is also important. To see pure memory controller's overhead, +testing on tmpfs will give you good numbers of small overheads. +Example: do kernel make on tmpfs. + +Page-fault scalability is also important. At measuring parallel +page fault test, multi-process test may be better than multi-thread +test because it has noise of shared objects/status. + +But the above two are testing extreme situations. +Trying usual test under memory controller is always helpful. + +4.1 Troubleshooting +------------------- + +Sometimes a user might find that the application under a cgroup is +terminated by the OOM killer. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and +seeing what happens will be helpful. + +4.2 Task migration +------------------ + +When a task migrates from one cgroup to another, its charge is not +carried forward by default. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +You can move charges of a task along with task migration. +See 8. "Move charges at task migration" + +4.3 Removing a cgroup +--------------------- + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. (because we charge against pages, not +against tasks.) + +We move the stats to root (if use_hierarchy==0) or parent (if +use_hierarchy==1), and no change on the charge except uncharging +from the child. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + +About use_hierarchy, see Section 6. + +5. Misc. interfaces +=================== + +5.1 force_empty +--------------- + memory.force_empty interface is provided to make cgroup's memory usage empty. + When writing anything to this:: + + # echo 0 > memory.force_empty + + the cgroup will be reclaimed and as many pages reclaimed as possible. + + The typical use case for this interface is before calling rmdir(). + Though rmdir() offlines memcg, but the memcg may still stay there due to + charged file caches. Some out-of-use page caches may keep charged until + memory pressure happens. If you want to avoid that, force_empty will be useful. + + Also, note that when memory.kmem.limit_in_bytes is set the charges due to + kernel pages will still be seen. This is not considered a failure and the + write will still return success. In this case, it is expected that + memory.kmem.usage_in_bytes == memory.usage_in_bytes. + + About use_hierarchy, see Section 6. + +5.2 stat file +------------- + +memory.stat file includes following statistics + +per-memory cgroup local status +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +=============== =============================================================== +cache # of bytes of page cache memory. +rss # of bytes of anonymous and swap cache memory (includes + transparent hugepages). +rss_huge # of bytes of anonymous transparent hugepages. +mapped_file # of bytes of mapped file (includes tmpfs/shmem) +pgpgin # of charging events to the memory cgroup. The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. +pgpgout # of uncharging events to the memory cgroup. The uncharging + event happens each time a page is unaccounted from the cgroup. +swap # of bytes of swap usage +dirty # of bytes that are waiting to get written back to the disk. +writeback # of bytes of file/anon cache that are queued for syncing to + disk. +inactive_anon # of bytes of anonymous and swap cache memory on inactive + LRU list. +active_anon # of bytes of anonymous and swap cache memory on active + LRU list. +inactive_file # of bytes of file-backed memory on inactive LRU list. +active_file # of bytes of file-backed memory on active LRU list. +unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). +=============== =============================================================== + +status considering hierarchy (see memory.use_hierarchy settings) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= =================================================== +hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy + under which the memory cgroup is +hierarchical_memsw_limit # of bytes of memory+swap limit with regard to + hierarchy under which memory cgroup is. + +total_ # hierarchical version of , which in + addition to the cgroup's own value includes the + sum of all hierarchical children's values of + , i.e. total_cache +========================= =================================================== + +The following additional stats are dependent on CONFIG_DEBUG_VM +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= ======================================== +recent_rotated_anon VM internal parameter. (see mm/vmscan.c) +recent_rotated_file VM internal parameter. (see mm/vmscan.c) +recent_scanned_anon VM internal parameter. (see mm/vmscan.c) +recent_scanned_file VM internal parameter. (see mm/vmscan.c) +========================= ======================================== + +Memo: + recent_rotated means recent frequency of LRU rotation. + recent_scanned means recent # of scans to LRU. + showing for better debug please see the code for meanings. + +Note: + Only anonymous and swap cache memory is listed as part of 'rss' stat. + This should not be confused with the true 'resident set size' or the + amount of physical memory used by the cgroup. + + 'rss + mapped_file" will give you resident set size of cgroup. + + (Note: file and shmem may be shared among other cgroups. In that case, + mapped_file is accounted only when the memory cgroup is owner of page + cache.) + +5.3 swappiness +-------------- + +Overrides /proc/sys/vm/swappiness for the particular group. The tunable +in the root cgroup corresponds to the global swappiness setting. + +Please note that unlike during the global reclaim, limit reclaim +enforces that 0 swappiness really prevents from any swapping even if +there is a swap storage available. This might lead to memcg OOM killer +if there are no file pages to reclaim. + +5.4 failcnt +----------- + +A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. +This failcnt(== failure count) shows the number of times that a usage counter +hit its limit. When a memory cgroup hits a limit, failcnt increases and +memory under it will be reclaimed. + +You can reset failcnt by writing 0 to failcnt file:: + + # echo 0 > .../memory.failcnt + +5.5 usage_in_bytes +------------------ + +For efficiency, as other kernel components, memory cgroup uses some optimization +to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the +method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz +value for efficient access. (Of course, when necessary, it's synchronized.) +If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) +value in memory.stat(see 5.2). + +5.6 numa_stat +------------- + +This is similar to numa_maps but operates on a per-memcg basis. This is +useful for providing visibility into the numa locality information within +an memcg since the pages are allowed to be allocated from any physical +node. One of the use cases is evaluating application performance by +combining this information with the application's CPU allocation. + +Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" +per-node page counts including "hierarchical_" which sums up all +hierarchical children's values in addition to the memcg's own value. + +The output format of memory.numa_stat is:: + + total= N0= N1= ... + file= N0= N1= ... + anon= N0= N1= ... + unevictable= N0= N1= ... + hierarchical_= N0= N1= ... + +The "total" count is sum of file + anon + unevictable. + +6. Hierarchy support +==================== + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy:: + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim +------------------------------------------------ + +A memory cgroup by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup:: + + # echo 1 > memory.use_hierarchy + +The feature can be disabled by:: + + # echo 0 > memory.use_hierarchy + +NOTE1: + Enabling/disabling will fail if either the cgroup already has other + cgroups created below it, or if the parent cgroup has use_hierarchy + enabled. + +NOTE2: + When panic_on_oom is set to "2", the whole system will panic in + case of an OOM event in any cgroup. + +7. Soft limits +============== + +Soft limits allow for greater sharing of memory. The idea behind soft limits +is to allow control groups to use as much of the memory as needed, provided + +a. There is no memory contention +b. They do not exceed their hard limit + +When the system detects memory contention or low memory, control groups +are pushed back to their soft limits. If the soft limit of each control +group is very high, they are pushed back as much as possible to make +sure that one control group does not starve the others of memory. + +Please note that soft limits is a best-effort feature; it comes with +no guarantees, but it does its best to make sure that when memory is +heavily contended for, memory is allocated based on the soft limit +hints/setup. Currently soft limit based reclaim is set up such that +it gets invoked from balance_pgdat (kswapd). + +7.1 Interface +------------- + +Soft limits can be setup by using the following commands (in this example we +assume a soft limit of 256 MiB):: + + # echo 256M > memory.soft_limit_in_bytes + +If we want to change this to 1G, we can at any time use:: + + # echo 1G > memory.soft_limit_in_bytes + +NOTE1: + Soft limits take effect over a long period of time, since they involve + reclaiming memory for balancing between memory cgroups +NOTE2: + It is recommended to set the soft limit always below the hard limit, + otherwise the hard limit will take precedence. + +8. Move charges at task migration +================================= + +Users can move charges associated with a task along with task migration, that +is, uncharge task's pages from the old cgroup and charge them to the new cgroup. +This feature is not supported in !CONFIG_MMU environments because of lack of +page tables. + +8.1 Interface +------------- + +This feature is disabled by default. It can be enabled (and disabled again) by +writing to memory.move_charge_at_immigrate of the destination cgroup. + +If you want to enable it:: + + # echo (some positive value) > memory.move_charge_at_immigrate + +Note: + Each bits of move_charge_at_immigrate has its own meaning about what type + of charges should be moved. See 8.2 for details. +Note: + Charges are moved only when you move mm->owner, in other words, + a leader of a thread group. +Note: + If we cannot find enough space for the task in the destination cgroup, we + try to make space by reclaiming memory. Task migration may fail if we + cannot make enough space. +Note: + It can take several seconds if you move charges much. + +And if you want disable it again:: + + # echo 0 > memory.move_charge_at_immigrate + +8.2 Type of charges which can be moved +-------------------------------------- + +Each bit in move_charge_at_immigrate has its own meaning about what type of +charges should be moved. But in any case, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current +(old) memory cgroup. + ++---+--------------------------------------------------------------------------+ +|bit| what type of charges would be moved ? | ++===+==========================================================================+ +| 0 | A charge of an anonymous page (or swap of it) used by the target task. | +| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | ++---+--------------------------------------------------------------------------+ +| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | +| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | +| | anonymous pages, file pages (and swaps) in the range mmapped by the task | +| | will be moved even if the task hasn't done page fault, i.e. they might | +| | not be the task's "RSS", but other task's "RSS" that maps the same file. | +| | And mapcount of the page is ignored (the page can be moved even if | +| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | +| | enable move of swap charges. | ++---+--------------------------------------------------------------------------+ + +8.3 TODO +-------- + +- All of moving charge operations are done under cgroup_mutex. It's not good + behavior to hold the mutex too long, so we may need some trick. + +9. Memory thresholds +==================== + +Memory cgroup implements memory thresholds using the cgroups notification +API (see cgroups.txt). It allows to register multiple memory and memsw +thresholds and gets notifications when it crosses. + +To register a threshold, an application must: + +- create an eventfd using eventfd(2); +- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; +- write string like " " to + cgroup.event_control. + +Application will be notified through eventfd when memory usage crosses +threshold in any direction. + +It's applicable for root and non-root cgroup. + +10. OOM Control +=============== + +memory.oom_control file is for OOM notification and other controls. + +Memory cgroup implements OOM notifier using the cgroup notification +API (See cgroups.txt). It allows to register multiple OOM notification +delivery and gets notification when OOM happens. + +To register a notifier, an application must: + + - create an eventfd using eventfd(2) + - open memory.oom_control file + - write string like " " to + cgroup.event_control + +The application will be notified through eventfd when OOM happens. +OOM notification doesn't work for the root cgroup. + +You can disable the OOM-killer by writing "1" to memory.oom_control file, as: + + #echo 1 > memory.oom_control + +If OOM-killer is disabled, tasks under cgroup will hang/sleep +in memory cgroup's OOM-waitqueue when they request accountable memory. + +For running them, you have to relax the memory cgroup's OOM status by + + * enlarge limit or reduce usage. + +To reduce usage, + + * kill some tasks. + * move some tasks to other group with account migration. + * remove some files (on tmpfs?) + +Then, stopped tasks will work again. + +At reading, current status of OOM is shown. + + - oom_kill_disable 0 or 1 + (if 1, oom-killer is disabled) + - under_oom 0 or 1 + (if 1, the memory cgroup is under OOM, tasks may be stopped.) + +11. Memory Pressure +=================== + +The pressure level notifications can be used to monitor the memory +allocation cost; based on the pressure, applications can implement +different strategies of managing their memory resources. The pressure +levels are defined as following: + +The "low" level means that the system is reclaiming memory for new +allocations. Monitoring this reclaiming activity might be useful for +maintaining cache level. Upon notification, the program (typically +"Activity Manager") might analyze vmstat and act in advance (i.e. +prematurely shutdown unimportant services). + +The "medium" level means that the system is experiencing medium memory +pressure, the system might be making swap, paging out active file caches, +etc. Upon this event applications may decide to further analyze +vmstat/zoneinfo/memcg or internal memory usage statistics and free any +resources that can be easily reconstructed or re-read from a disk. + +The "critical" level means that the system is actively thrashing, it is +about to out of memory (OOM) or even the in-kernel OOM killer is on its +way to trigger. Applications should do whatever they can to help the +system. It might be too late to consult with vmstat or any other +statistics, so it's advisable to take an immediate action. + +By default, events are propagated upward until the event is handled, i.e. the +events are not pass-through. For example, you have three cgroups: A->B->C. Now +you set up an event listener on cgroups A, B and C, and suppose group C +experiences some pressure. In this situation, only group C will receive the +notification, i.e. groups A and B will not receive it. This is done to avoid +excessive "broadcasting" of messages, which disturbs the system and which is +especially bad if we are low on memory or thrashing. Group B, will receive +notification only if there are no event listers for group C. + +There are three optional modes that specify different propagation behavior: + + - "default": this is the default behavior specified above. This mode is the + same as omitting the optional mode parameter, preserved by backwards + compatibility. + + - "hierarchy": events always propagate up to the root, similar to the default + behavior, except that propagation continues regardless of whether there are + event listeners at each level, with the "hierarchy" mode. In the above + example, groups A, B, and C will receive notification of memory pressure. + + - "local": events are pass-through, i.e. they only receive notifications when + memory pressure is experienced in the memcg for which the notification is + registered. In the above example, group C will receive notification if + registered for "local" notification and the group experiences memory + pressure. However, group B will never receive notification, regardless if + there is an event listener for group C or not, if group B is registered for + local notification. + +The level and event notification mode ("hierarchy" or "local", if necessary) are +specified by a comma-delimited string, i.e. "low,hierarchy" specifies +hierarchical, pass-through, notification for all ancestor memcgs. Notification +that is the default, non pass-through behavior, does not specify a mode. +"medium,local" specifies pass-through notification for the medium level. + +The file memory.pressure_level is only used to setup an eventfd. To +register a notification, an application must: + +- create an eventfd using eventfd(2); +- open memory.pressure_level; +- write string as " " + to cgroup.event_control. + +Application will be notified through eventfd when memory pressure is at +the specific level (or higher). Read/write operations to +memory.pressure_level are no implemented. + +Test: + + Here is a small script example that makes a new cgroup, sets up a + memory limit, sets up a notification in the cgroup and then makes child + cgroup experience a critical pressure:: + + # cd /sys/fs/cgroup/memory/ + # mkdir foo + # cd foo + # cgroup_event_listener memory.pressure_level low,hierarchy & + # echo 8000000 > memory.limit_in_bytes + # echo 8000000 > memory.memsw.limit_in_bytes + # echo $$ > tasks + # dd if=/dev/zero | read x + + (Expect a bunch of notifications, and eventually, the oom-killer will + trigger.) + +12. TODO +======== + +1. Make per-cgroup scanner reclaim not-shared pages first +2. Teach controller to account for shared-pages +3. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary +======= + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References +========== + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/admin-guide/cgroup-v1/net_cls.rst b/Documentation/admin-guide/cgroup-v1/net_cls.rst new file mode 100644 index 000000000000..a2cf272af7a0 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/net_cls.rst @@ -0,0 +1,44 @@ +========================= +Network classifier cgroup +========================= + +The Network classifier cgroup provides an interface to +tag network packets with a class identifier (classid). + +The Traffic Controller (tc) can be used to assign +different priorities to packets from different cgroups. +Also, Netfilter (iptables) can use this tag to perform +actions on such packets. + +Creating a net_cls cgroups instance creates a net_cls.classid file. +This net_cls.classid value is initialized to 0. + +You can write hexadecimal values to net_cls.classid; the format for these +values is 0xAAAABBBB; AAAA is the major handle number and BBBB +is the minor handle number. +Reading net_cls.classid yields a decimal result. + +Example:: + + mkdir /sys/fs/cgroup/net_cls + mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls + mkdir /sys/fs/cgroup/net_cls/0 + echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid + +- setting a 10:1 handle:: + + cat /sys/fs/cgroup/net_cls/0/net_cls.classid + 1048577 + +- configuring tc:: + + tc qdisc add dev eth0 root handle 10: htb + tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit + +- creating traffic class 10:1:: + + tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup + +configuring iptables, basic example:: + + iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/admin-guide/cgroup-v1/net_prio.rst b/Documentation/admin-guide/cgroup-v1/net_prio.rst new file mode 100644 index 000000000000..b40905871c64 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/net_prio.rst @@ -0,0 +1,57 @@ +======================= +Network priority cgroup +======================= + +The Network priority cgroup provides an interface to allow an administrator to +dynamically set the priority of network traffic generated by various +applications + +Nominally, an application would set the priority of its traffic via the +SO_PRIORITY socket option. This however, is not always possible because: + +1) The application may not have been coded to set this value +2) The priority of application traffic is often a site-specific administrative + decision rather than an application defined one. + +This cgroup allows an administrator to assign a process to a group which defines +the priority of egress traffic on a given interface. Network priority groups can +be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio + +With the above step, the initial group acting as the parent accounting group +becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in +the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. + +Each net_prio cgroup contains two files that are subsystem specific + +net_prio.prioidx + This file is read-only, and is simply informative. It contains a unique + integer value that the kernel uses as an internal representation of this + cgroup. + +net_prio.ifpriomap + This file contains a map of the priorities assigned to traffic originating + from processes in this group and egressing the system on various interfaces. + It contains a list of tuples in the form . Contents of this + file can be modified by echoing a string into the file using the same tuple + format. For example:: + + echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap + +This command would force any traffic originating from processes belonging to the +iscsi net_prio cgroup and egressing on interface eth0 to have the priority of +said traffic set to the value 5. The parent accounting group also has a +writeable 'net_prio.ifpriomap' file that can be used to set a system default +priority. + +Priorities are set immediately prior to queueing a frame to the device +queueing discipline (qdisc) so priorities will be assigned prior to the hardware +queue selection being made. + +One usage for the net_prio cgroup is with mqprio qdisc allowing application +traffic to be steered to hardware/driver based traffic classes. These mappings +can then be managed by administrators or other networking protocols such as +DCBX. + +A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst new file mode 100644 index 000000000000..6acebd9e72c8 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/pids.rst @@ -0,0 +1,92 @@ +========================= +Process Number Controller +========================= + +Abstract +-------- + +The process number controller is used to allow a cgroup hierarchy to stop any +new tasks from being fork()'d or clone()'d after a certain limit is reached. + +Since it is trivial to hit the task limit without hitting any kmemcg limits in +place, PIDs are a fundamental resource. As such, PID exhaustion must be +preventable in the scope of a cgroup hierarchy by allowing resource limiting of +the number of tasks in a cgroup. + +Usage +----- + +In order to use the `pids` controller, set the maximum number of tasks in +pids.max (this is not available in the root cgroup for obvious reasons). The +number of processes currently in the cgroup is given by pids.current. + +Organisational operations are not blocked by cgroup policies, so it is possible +to have pids.current > pids.max. This can be done by either setting the limit to +be smaller than pids.current, or attaching enough processes to the cgroup such +that pids.current > pids.max. However, it is not possible to violate a cgroup +policy through fork() or clone(). fork() and clone() will return -EAGAIN if the +creation of a new process would cause a cgroup policy to be violated. + +To set a cgroup to have no limit, set pids.max to "max". This is the default for +all new cgroups (N.B. that PID limits are hierarchical, so the most stringent +limit in the hierarchy is followed). + +pids.current tracks all child cgroup hierarchies, so parent/pids.current is a +superset of parent/child/pids.current. + +The pids.events file contains event counters: + + - max: Number of times fork failed because limit was hit. + +Example +------- + +First, we mount the pids controller:: + + # mkdir -p /sys/fs/cgroup/pids + # mount -t cgroup -o pids none /sys/fs/cgroup/pids + +Then we create a hierarchy, set limits and attach processes to it:: + + # mkdir -p /sys/fs/cgroup/pids/parent/child + # echo 2 > /sys/fs/cgroup/pids/parent/pids.max + # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # + +It should be noted that attempts to overcome the set limit (2 in this case) will +fail:: + + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # ( /bin/echo "Here's some processes for you." | cat ) + sh: fork: Resource temporary unavailable + # + +Even if we migrate to a child cgroup (which doesn't have a set limit), we will +not be able to overcome the most stringent limit in the hierarchy (in this case, +parent's):: + + # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.max + max + # ( /bin/echo "Here's some processes for you." | cat ) + sh: fork: Resource temporary unavailable + # + +We can set a limit that is smaller than pids.current, which will stop any new +processes from being forked at all (note that the shell itself counts towards +pids.current):: + + # echo 1 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." + sh: fork: Resource temporary unavailable + # echo 0 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." + sh: fork: Resource temporary unavailable + # diff --git a/Documentation/admin-guide/cgroup-v1/rdma.rst b/Documentation/admin-guide/cgroup-v1/rdma.rst new file mode 100644 index 000000000000..2fcb0a9bf790 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/rdma.rst @@ -0,0 +1,117 @@ +=============== +RDMA Controller +=============== + +.. Contents + + 1. Overview + 1-1. What is RDMA controller? + 1-2. Why RDMA controller needed? + 1-3. How is RDMA controller implemented? + 2. Usage Examples + +1. Overview +=========== + +1-1. What is RDMA controller? +----------------------------- + +RDMA controller allows user to limit RDMA/IB specific resources that a given +set of processes can use. These processes are grouped using RDMA controller. + +RDMA controller defines two resources which can be limited for processes of a +cgroup. + +1-2. Why RDMA controller needed? +-------------------------------- + +Currently user space applications can easily take away all the rdma verb +specific resources such as AH, CQ, QP, MR etc. Due to which other applications +in other cgroup or kernel space ULPs may not even get chance to allocate any +rdma resources. This can lead to service unavailability. + +Therefore RDMA controller is needed through which resource consumption +of processes can be limited. Through this controller different rdma +resources can be accounted. + +1-3. How is RDMA controller implemented? +---------------------------------------- + +RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains +resource accounting per cgroup, per device using resource pool structure. +Each such resource pool is limited up to 64 resources in given resource pool +by rdma cgroup, which can be extended later if required. + +This resource pool object is linked to the cgroup css. Typically there +are 0 to 4 resource pool instances per cgroup, per device in most use cases. +But nothing limits to have it more. At present hundreds of RDMA devices per +single cgroup may not be handled optimally, however there is no +known use case or requirement for such configuration either. + +Since RDMA resources can be allocated from any process and can be freed by any +of the child processes which shares the address space, rdma resources are +always owned by the creator cgroup css. This allows process migration from one +to other cgroup without major complexity of transferring resource ownership; +because such ownership is not really present due to shared nature of +rdma resources. Linking resources around css also ensures that cgroups can be +deleted after processes migrated. This allow progress migration as well with +active resources, even though that is not a primary use case. + +Whenever RDMA resource charging occurs, owner rdma cgroup is returned to +the caller. Same rdma cgroup should be passed while uncharging the resource. +This also allows process migrated with active RDMA resource to charge +to new owner cgroup for new resource. It also allows to uncharge resource of +a process from previously charged cgroup which is migrated to new cgroup, +even though that is not a primary use case. + +Resource pool object is created in following situations. +(a) User sets the limit and no previous resource pool exist for the device +of interest for the cgroup. +(b) No resource limits were configured, but IB/RDMA stack tries to +charge the resource. So that it correctly uncharge them when applications are +running without limits and later on when limits are enforced during uncharging, +otherwise usage count will drop to negative. + +Resource pool is destroyed if all the resource limits are set to max and +it is the last resource getting deallocated. + +User should set all the limit to max value if it intents to remove/unconfigure +the resource pool for a particular device. + +IB stack honors limits enforced by the rdma controller. When application +query about maximum resource limits of IB device, it returns minimum of +what is configured by user for a given cgroup and what is supported by +IB device. + +Following resources can be accounted by rdma controller. + + ========== ============================= + hca_handle Maximum number of HCA Handles + hca_object Maximum number of HCA Objects + ========== ============================= + +2. Usage Examples +================= + +(a) Configure resource limit:: + + echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max + echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max + +(b) Query resource limit:: + + cat /sys/fs/cgroup/rdma/2/rdma.max + #Output: + mlx4_0 hca_handle=2 hca_object=2000 + ocrdma1 hca_handle=3 hca_object=max + +(c) Query current usage:: + + cat /sys/fs/cgroup/rdma/2/rdma.current + #Output: + mlx4_0 hca_handle=1 hca_object=20 + ocrdma1 hca_handle=1 hca_object=23 + +(d) Delete resource limit:: + + echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 080b18ce2a5d..ed4c5977d6e1 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and conventions of cgroup v2. It describes all userland-visible aspects of cgroup including core and specific controller behaviors. All future changes must be reflected in this document. Documentation for -v1 is available under Documentation/cgroup-v1/. +v1 is available under Documentation/admin-guide/cgroup-v1/. .. CONTENTS diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 1f0d9b939311..a5fdb1a846ce 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -59,6 +59,7 @@ configure specific aspects of kernel behavior to your liking. initrd cgroup-v2 + cgroup-v1/index serial-console braille-console parport diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 78576aa45cce..a571a67e0c85 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4089,7 +4089,7 @@ relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. - See Documentation/cgroup-v1/cpusets.rst. + See Documentation/admin-guide/cgroup-v1/cpusets.rst. reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory Format: ,[,,,...] @@ -4599,7 +4599,7 @@ swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable - it if 0 is given (See Documentation/cgroup-v1/memory.rst) + it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst) swiotlb= [ARM,IA-64,PPC,MIPS,X86] Format: { | force | noforce } diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 546f174e5d6a..8463f5538fda 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy support. Memory policies should not be confused with cpusets -(``Documentation/cgroup-v1/cpusets.rst``) +(``Documentation/admin-guide/cgroup-v1/cpusets.rst``) which is an administrative mechanism for restricting the nodes from which memory may be allocated by a set of processes. Memory policies are a programming interface that a NUMA-aware application can take advantage of. When diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst index 2c13b2fc1888..0d237d402860 100644 --- a/Documentation/block/bfq-iosched.rst +++ b/Documentation/block/bfq-iosched.rst @@ -547,7 +547,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files created, and kept up-to-date by bfq, depends on whether CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all the stat files documented in -Documentation/cgroup-v1/blkio-controller.rst. If, instead, +Documentation/admin-guide/cgroup-v1/blkio-controller.rst. If, instead, CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files:: blkio.bfq.io_service_bytes diff --git a/Documentation/cgroup-v1/blkio-controller.rst b/Documentation/cgroup-v1/blkio-controller.rst deleted file mode 100644 index 1d7d962933be..000000000000 --- a/Documentation/cgroup-v1/blkio-controller.rst +++ /dev/null @@ -1,302 +0,0 @@ -=================== -Block IO Controller -=================== - -Overview -======== -cgroup subsys "blkio" implements the block io controller. There seems to be -a need of various kinds of IO control policies (like proportional BW, max BW) -both at leaf nodes as well as at intermediate nodes in a storage hierarchy. -Plan is to use the same cgroup based management interface for blkio controller -and based on user options switch IO policies in the background. - -One IO control policy is throttling policy which can be used to -specify upper IO rate limits on devices. This policy is implemented in -generic block layer and can be used on leaf nodes as well as higher -level logical devices like device mapper. - -HOWTO -===== -Throttling/Upper Limit policy ------------------------------ -- Enable Block IO controller:: - - CONFIG_BLK_CGROUP=y - -- Enable throttling in block layer:: - - CONFIG_BLK_DEV_THROTTLING=y - -- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: - - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Specify a bandwidth rate on particular device for root group. The format - for policy is ": ":: - - echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device - - Above will put a limit of 1MB/second on reads happening for root group - on device having major/minor number 8:16. - -- Run dd to read a file and see if rate is throttled to 1MB/s or not:: - - # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 - 1024+0 records in - 1024+0 records out - 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s - - Limits for writes can be put using blkio.throttle.write_bps_device file. - -Hierarchical Cgroups -==================== - -Throttling implements hierarchy support; however, -throttling's hierarchy support is enabled iff "sane_behavior" is -enabled from cgroup side, which currently is a development option and -not publicly available. - -If somebody created a hierarchy like as follows:: - - root - / \ - test1 test2 - | - test3 - -Throttling with "sane_behavior" will handle the -hierarchy correctly. For throttling, all limits apply -to the whole subtree while all statistics are local to the IOs -directly generated by tasks in that cgroup. - -Throttling without "sane_behavior" enabled from cgroup side will -practically treat all groups at same level as if it looks like the -following:: - - pivot - / / \ \ - root test1 test2 test3 - -Various user visible config options -=================================== -CONFIG_BLK_CGROUP - - Block IO controller. - -CONFIG_BFQ_CGROUP_DEBUG - - Debug help. Right now some additional stats file show up in cgroup - if this option is enabled. - -CONFIG_BLK_DEV_THROTTLING - - Enable block device throttling support in block layer. - -Details of cgroup files -======================= -Proportional weight policy files --------------------------------- -- blkio.weight - - Specifies per cgroup weight. This is default weight of the group - on all the devices until and unless overridden by per device rule. - (See blkio.weight_device). - Currently allowed range of weights is from 10 to 1000. - -- blkio.weight_device - - One can specify per cgroup per device rules using this interface. - These rules override the default value of group weight as specified - by blkio.weight. - - Following is the format:: - - # echo dev_maj:dev_minor weight > blkio.weight_device - - Configure weight=300 on /dev/sdb (8:16) in this cgroup:: - - # echo 8:16 300 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - - Configure weight=500 on /dev/sda (8:0) in this cgroup:: - - # echo 8:0 500 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:0 500 - 8:16 300 - - Remove specific weight for /dev/sda in this cgroup:: - - # echo 8:0 0 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - -- blkio.leaf_weight[_device] - - Equivalents of blkio.weight[_device] for the purpose of - deciding how much weight tasks in the given cgroup has while - competing with the cgroup's child cgroups. For details, - please refer to Documentation/block/cfq-iosched.txt. - -- blkio.time - - disk time allocated to cgroup per device in milliseconds. First - two fields specify the major and minor number of the device and - third field specifies the disk time allocated to group in - milliseconds. - -- blkio.sectors - - number of sectors transferred to/from disk by the group. First - two fields specify the major and minor number of the device and - third field specifies the number of sectors transferred by the - group to/from the device. - -- blkio.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -- blkio.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.io_service_time - - Total amount of time between request dispatch and request completion - for the IOs done by this cgroup. This is in nanoseconds to make it - meaningful for flash devices too. For devices with queue depth of 1, - this time represents the actual service time. When queue_depth > 1, - that is no longer true as requests may be served out of order. This - may cause the service time for a given IO to include the service time - of multiple IOs when served out of order which may result in total - io_service_time > actual time elapsed. This time is further divided by - the type of operation - read or write, sync or async. First two fields - specify the major and minor number of the device, third field - specifies the operation type and the fourth field specifies the - io_service_time in ns. - -- blkio.io_wait_time - - Total amount of time the IOs for this cgroup spent waiting in the - scheduler queues for service. This can be greater than the total time - elapsed since it is cumulative io_wait_time for all IOs. It is not a - measure of total time the cgroup spent waiting but rather a measure of - the wait_time for its individual IOs. For devices with queue_depth > 1 - this metric does not include the time spent waiting for service once - the IO is dispatched to the device but till it actually gets serviced - (there might be a time lag here due to re-ordering of requests by the - device). This is in nanoseconds to make it meaningful for flash - devices too. This time is further divided by the type of operation - - read or write, sync or async. First two fields specify the major and - minor number of the device, third field specifies the operation type - and the fourth field specifies the io_wait_time in ns. - -- blkio.io_merged - - Total number of bios/requests merged into requests belonging to this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.io_queued - - Total number of requests queued up at any given instant for this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - The average queue size for this cgroup over the entire time of this - cgroup's existence. Queue size samples are taken each time one of the - queues of this cgroup gets a timeslice. - -- blkio.group_wait_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time the cgroup had to wait since it became busy - (i.e., went from 0 to 1 request queued) to get a timeslice for one of - its queues. This is different from the io_wait_time which is the - cumulative total of the amount of time spent by each IO in that cgroup - waiting in the scheduler queue. This is in nanoseconds. If this is - read when the cgroup is in a waiting (for timeslice) state, the stat - will only report the group_wait_time accumulated till the last time it - got a timeslice and will not include the current delta. - -- blkio.empty_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time a cgroup spends without any pending - requests when not being served, i.e., it does not include any time - spent idling for one of the queues of the cgroup. This is in - nanoseconds. If this is read when the cgroup is in an empty state, - the stat will only report the empty_time accumulated till the last - time it had a pending request and will not include the current delta. - -- blkio.idle_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time spent by the IO scheduler idling for a - given cgroup in anticipation of a better request than the existing ones - from other queues/cgroups. This is in nanoseconds. If this is read - when the cgroup is in an idling state, the stat will only report the - idle_time accumulated till the last idle period and will not include - the current delta. - -- blkio.dequeue - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This - gives the statistics about how many a times a group was dequeued - from service tree of the device. First two fields specify the major - and minor number of the device and third field specifies the number - of times a group was dequeued from a particular device. - -- blkio.*_recursive - - Recursive version of various stats. These files show the - same information as their non-recursive counterparts but - include stats from all the descendant cgroups. - -Throttling/Upper limit policy files ------------------------------------ -- blkio.throttle.read_bps_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.read_bps_device - -- blkio.throttle.write_bps_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.write_bps_device - -- blkio.throttle.read_iops_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in IO per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.read_iops_device - -- blkio.throttle.write_iops_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in io per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.write_iops_device - -Note: If both BW and IOPS rules are specified for a device, then IO is - subjected to both the constraints. - -- blkio.throttle.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.throttle.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -Common files among various policies ------------------------------------ -- blkio.reset_stats - - Writing an int to this file will result in resetting all the stats - for that cgroup. diff --git a/Documentation/cgroup-v1/cgroups.rst b/Documentation/cgroup-v1/cgroups.rst deleted file mode 100644 index 46bbe7e022d4..000000000000 --- a/Documentation/cgroup-v1/cgroups.rst +++ /dev/null @@ -1,695 +0,0 @@ -============== -Control Groups -============== - -Written by Paul Menage based on -Documentation/cgroup-v1/cpusets.rst - -Original copyright statements from cpusets.txt: - -Portions Copyright (C) 2004 BULL SA. - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. - -Modified by Paul Jackson - -Modified by Christoph Lameter - -.. CONTENTS: - - 1. Control Groups - 1.1 What are cgroups ? - 1.2 Why are cgroups needed ? - 1.3 How are cgroups implemented ? - 1.4 What does notify_on_release do ? - 1.5 What does clone_children do ? - 1.6 How do I use cgroups ? - 2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Attaching processes - 2.3 Mounting hierarchies by name - 3. Kernel API - 3.1 Overview - 3.2 Synchronization - 3.3 Subsystem API - 4. Extended attributes usage - 5. Questions - -1. Control Groups -================= - -1.1 What are cgroups ? ----------------------- - -Control Groups provide a mechanism for aggregating/partitioning sets of -tasks, and all their future children, into hierarchical groups with -specialized behaviour. - -Definitions: - -A *cgroup* associates a set of tasks with a set of parameters for one -or more subsystems. - -A *subsystem* is a module that makes use of the task grouping -facilities provided by cgroups to treat groups of tasks in -particular ways. A subsystem is typically a "resource controller" that -schedules a resource or applies per-cgroup limits, but it may be -anything that wants to act on a group of processes, e.g. a -virtualization subsystem. - -A *hierarchy* is a set of cgroups arranged in a tree, such that -every task in the system is in exactly one of the cgroups in the -hierarchy, and a set of subsystems; each subsystem has system-specific -state attached to each cgroup in the hierarchy. Each hierarchy has -an instance of the cgroup virtual filesystem associated with it. - -At any one time there may be multiple active hierarchies of task -cgroups. Each hierarchy is a partition of all tasks in the system. - -User-level code may create and destroy cgroups by name in an -instance of the cgroup virtual file system, specify and query to -which cgroup a task is assigned, and list the task PIDs assigned to -a cgroup. Those creations and assignments only affect the hierarchy -associated with that instance of the cgroup file system. - -On their own, the only use for cgroups is for simple job -tracking. The intention is that other subsystems hook into the generic -cgroup support to provide new attributes for cgroups, such as -accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow -you to associate a set of CPUs and a set of memory nodes with the -tasks in each cgroup. - -1.2 Why are cgroups needed ? ----------------------------- - -There are multiple efforts to provide process aggregations in the -Linux kernel, mainly for resource-tracking purposes. Such efforts -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server -namespaces. These all require the basic notion of a -grouping/partitioning of processes, with newly forked processes ending -up in the same group (cgroup) as their parent process. - -The kernel cgroup patch provides the minimum essential kernel -mechanisms required to efficiently implement such groups. It has -minimal impact on the system fast paths, and provides hooks for -specific subsystems such as cpusets to provide additional behaviour as -desired. - -Multiple hierarchy support is provided to allow for situations where -the division of tasks into cgroups is distinctly different for -different subsystems - having parallel hierarchies allows each -hierarchy to be a natural division of tasks, without having to handle -complex combinations of tasks that would be present if several -unrelated subsystems needed to be forced into the same tree of -cgroups. - -At one extreme, each resource controller or subsystem could be in a -separate hierarchy; at the other extreme, all subsystems -would be attached to the same hierarchy. - -As an example of a scenario (originally proposed by vatsa@in.ibm.com) -that can benefit from multiple hierarchies, consider a large -university server with various users - students, professors, system -tasks etc. The resource planning for this server could be along the -following lines:: - - CPU : "Top cpuset" - / \ - CPUSet1 CPUSet2 - | | - (Professors) (Students) - - In addition (system tasks) are attached to topcpuset (so - that they can run anywhere) with a limit of 20% - - Memory : Professors (50%), Students (30%), system (20%) - - Disk : Professors (50%), Students (30%), system (20%) - - Network : WWW browsing (20%), Network File System (60%), others (20%) - / \ - Professors (15%) students (5%) - -Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes -into the NFS network class. - -At the same time Firefox/Lynx will share an appropriate CPU/Memory class -depending on who launched it (prof/student). - -With the ability to classify tasks differently for different resources -(by putting those resource subsystems in different hierarchies), -the admin can easily set up a script which receives exec notifications -and depending on who is launching the browser he can:: - - # echo browser_pid > /sys/fs/cgroup///tasks - -With only a single hierarchy, he now would potentially have to create -a separate cgroup for every browser launched and associate it with -appropriate network and other resource class. This may lead to -proliferation of such cgroups. - -Also let's say that the administrator would like to give enhanced network -access temporarily to a student's browser (since it is night and the user -wants to do online gaming :)) OR give one of the student's simulation -apps enhanced CPU power. - -With ability to write PIDs directly to resource classes, it's just a -matter of:: - - # echo pid > /sys/fs/cgroup/network//tasks - (after some time) - # echo pid > /sys/fs/cgroup/network//tasks - -Without this ability, the administrator would have to split the cgroup into -multiple separate ones and then associate the new cgroups with the -new resource classes. - - - -1.3 How are cgroups implemented ? ---------------------------------- - -Control Groups extends the kernel as follows: - - - Each task in the system has a reference-counted pointer to a - css_set. - - - A css_set contains a set of reference-counted pointers to - cgroup_subsys_state objects, one for each cgroup subsystem - registered in the system. There is no direct link from a task to - the cgroup of which it's a member in each hierarchy, but this - can be determined by following pointers through the - cgroup_subsys_state objects. This is because accessing the - subsystem state is something that's expected to happen frequently - and in performance-critical code, whereas operations that require a - task's actual cgroup assignments (in particular, moving between - cgroups) are less common. A linked list runs through the cg_list - field of each task_struct using the css_set, anchored at - css_set->tasks. - - - A cgroup hierarchy filesystem can be mounted for browsing and - manipulation from user space. - - - You can list all the tasks (by PID) attached to any cgroup. - -The implementation of cgroups requires a few, simple hooks -into the rest of the kernel, none in performance-critical paths: - - - in init/main.c, to initialize the root cgroups and initial - css_set at system boot. - - - in fork and exit, to attach and detach a task from its css_set. - -In addition, a new file system of type "cgroup" may be mounted, to -enable browsing and modifying the cgroups presently known to the -kernel. When mounting a cgroup hierarchy, you may specify a -comma-separated list of subsystems to mount as the filesystem mount -options. By default, mounting the cgroup filesystem attempts to -mount a hierarchy containing all registered subsystems. - -If an active hierarchy with exactly the same set of subsystems already -exists, it will be reused for the new mount. If no existing hierarchy -matches, and any of the requested subsystems are in use in an existing -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy -is activated, associated with the requested subsystems. - -It's not currently possible to bind a new subsystem to an active -cgroup hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. This may be possible in future, but is fraught with nasty -error-recovery issues. - -When a cgroup filesystem is unmounted, if there are any -child cgroups created below the top-level cgroup, that hierarchy -will remain active even though unmounted; if there are no -child cgroups then the hierarchy will be deactivated. - -No new system calls are added for cgroups - all support for -querying and modifying cgroups is via this cgroup file system. - -Each task under /proc has an added file named 'cgroup' displaying, -for each active hierarchy, the subsystem names and the cgroup name -as the path relative to the root of the cgroup file system. - -Each cgroup is represented by a directory in the cgroup file system -containing the following files describing that cgroup: - - - tasks: list of tasks (by PID) attached to that cgroup. This list - is not guaranteed to be sorted. Writing a thread ID into this file - moves the thread into this cgroup. - - cgroup.procs: list of thread group IDs in the cgroup. This list is - not guaranteed to be sorted or free of duplicate TGIDs, and userspace - should sort/uniquify the list if this property is required. - Writing a thread group ID into this file moves all threads in that - group into this cgroup. - - notify_on_release flag: run the release agent on exit? - - release_agent: the path to use for release notifications (this file - exists in the top cgroup only) - -Other subsystems such as cpusets may add additional files in each -cgroup dir. - -New cgroups are created using the mkdir system call or shell -command. The properties of a cgroup, such as its flags, are -modified by writing to the appropriate file in that cgroups -directory, as listed above. - -The named hierarchical structure of nested cgroups allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cgroup allows organizing the work load -on a system into related sets of tasks. A task may be re-attached to -any other cgroup, if allowed by the permissions on the necessary -cgroup file system directories. - -When a task is moved from one cgroup to another, it gets a new -css_set pointer - if there's an already existing css_set with the -desired collection of cgroups then that group is reused, otherwise a new -css_set is allocated. The appropriate existing css_set is located by -looking into a hash table. - -To allow access from a cgroup to the css_sets (and hence tasks) -that comprise it, a set of cg_cgroup_link objects form a lattice; -each cg_cgroup_link is linked into a list of cg_cgroup_links for -a single cgroup on its cgrp_link_list field, and a list of -cg_cgroup_links for a single css_set on its cg_link_list. - -Thus the set of tasks in a cgroup can be listed by iterating over -each css_set that references the cgroup, and sub-iterating over -each css_set's task set. - -The use of a Linux virtual file system (vfs) to represent the -cgroup hierarchy provides for a familiar permission and name space -for cgroups, with a minimum of additional kernel code. - -1.4 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cgroup, then -whenever the last task in the cgroup leaves (exits or attaches to -some other cgroup) and the last child cgroup of that cgroup -is removed, then the kernel runs the command specified by the contents -of the "release_agent" file in that hierarchy's root directory, -supplying the pathname (relative to the mount point of the cgroup -file system) of the abandoned cgroup. This enables automatic -removal of abandoned cgroups. The default value of -notify_on_release in the root cgroup at system boot is disabled -(0). The default value of other cgroups at creation is the current -value of their parents' notify_on_release settings. The default value of -a cgroup hierarchy's release_agent path is empty. - -1.5 What does clone_children do ? ---------------------------------- - -This flag only affects the cpuset controller. If the clone_children -flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its -configuration from the parent during initialization. - -1.6 How do I use cgroups ? --------------------------- - -To start a new job that is to be contained within a cgroup, using -the "cpuset" cgroup subsystem, the steps are something like:: - - 1) mount -t tmpfs cgroup_root /sys/fs/cgroup - 2) mkdir /sys/fs/cgroup/cpuset - 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 4) Create the new cgroup by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 5) Start a task that will be the "founding father" of the new job. - 6) Attach that task to the new cgroup by writing its PID to the - /sys/fs/cgroup/cpuset tasks file for that cgroup. - 7) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cgroup -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cgroup:: - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/cpuset - mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cgroup Charlie - # The next line should display '/Charlie' - cat /proc/self/cgroup - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using cgroups can be done through the cgroup -virtual filesystem. - -To mount a cgroup hierarchy with all available subsystems, type:: - - # mount -t cgroup xxx /sys/fs/cgroup - -The "xxx" is not interpreted by the cgroup code, but will appear in -/proc/mounts so may be any useful identifying string that you like. - -Note: Some subsystems do not work without some user input first. For instance, -if cpusets are enabled the user will have to populate the cpus and mems files -for each new cgroup created before that group can be used. - -As explained in section `1.2 Why are cgroups needed?` you should create -different hierarchies of cgroups for each single resource or group of -resources you want to control. Therefore, you should mount a tmpfs on -/sys/fs/cgroup and create directories for each cgroup resource or resource -group:: - - # mount -t tmpfs cgroup_root /sys/fs/cgroup - # mkdir /sys/fs/cgroup/rg1 - -To mount a cgroup hierarchy with just the cpuset and memory -subsystems, type:: - - # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 - -While remounting cgroups is currently supported, it is not recommend -to use it. Remounting allows changing bound subsystems and -release_agent. Rebinding is hardly useful as it only works when the -hierarchy is empty and release_agent itself should be replaced with -conventional fsnotify. The support for remounting will be removed in -the future. - -To Specify a hierarchy's release_agent:: - - # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ - xxx /sys/fs/cgroup/rg1 - -Note that specifying 'release_agent' more than once will return failure. - -Note that changing the set of subsystems is currently only supported -when the hierarchy consists of a single (root) cgroup. Supporting -the ability to arbitrarily bind/unbind subsystems from an existing -cgroup hierarchy is intended to be implemented in the future. - -Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the -tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 -is the cgroup that holds the whole system. - -If you want to change the value of release_agent:: - - # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent - -It can also be changed via remount. - -If you want to create a new cgroup under /sys/fs/cgroup/rg1:: - - # cd /sys/fs/cgroup/rg1 - # mkdir my_cgroup - -Now you want to do something with this cgroup: - - # cd my_cgroup - -In this directory you can find several files:: - - # ls - cgroup.procs notify_on_release tasks - (plus whatever files added by the attached subsystems) - -Now attach your shell to this cgroup:: - - # /bin/echo $$ > tasks - -You can also create cgroups inside your cgroup by using mkdir in this -directory:: - - # mkdir my_sub_cs - -To remove a cgroup, just use rmdir:: - - # rmdir my_sub_cs - -This will fail if the cgroup is in use (has cgroups inside, or -has processes attached, or is held alive by other subsystem-specific -reference). - -2.2 Attaching processes ------------------------ - -:: - - # /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another:: - - # /bin/echo PID1 > tasks - # /bin/echo PID2 > tasks - ... - # /bin/echo PIDn > tasks - -You can attach the current shell task by echoing 0:: - - # echo 0 > tasks - -You can use the cgroup.procs file instead of the tasks file to move all -threads in a threadgroup at once. Echoing the PID of any task in a -threadgroup to cgroup.procs causes all tasks in that threadgroup to be -attached to the cgroup. Writing 0 to cgroup.procs moves all tasks -in the writing task's threadgroup. - -Note: Since every task is always a member of exactly one cgroup in each -mounted hierarchy, to remove a task from its current cgroup you must -move it into a new cgroup (possibly the root cgroup) by writing to the -new cgroup's tasks file. - -Note: Due to some restrictions enforced by some cgroup subsystems, moving -a process to another cgroup can fail. - -2.3 Mounting hierarchies by name --------------------------------- - -Passing the name= option when mounting a cgroups hierarchy -associates the given name with the hierarchy. This can be used when -mounting a pre-existing hierarchy, in order to refer to it by name -rather than by its set of active subsystems. Each hierarchy is either -nameless, or has a unique name. - -The name should match [\w.-]+ - -When passing a name= option for a new hierarchy, you need to -specify subsystems manually; the legacy behaviour of mounting all -subsystems when none are explicitly specified is not supported when -you give a subsystem a name. - -The name of the subsystem appears as part of the hierarchy description -in /proc/mounts and /proc//cgroups. - - -3. Kernel API -============= - -3.1 Overview ------------- - -Each kernel subsystem that wants to hook into the generic cgroup -system needs to create a cgroup_subsys object. This contains -various methods, which are callbacks from the cgroup system, along -with a subsystem ID which will be assigned by the cgroup system. - -Other fields in the cgroup_subsys object include: - -- subsys_id: a unique array index for the subsystem, indicating which - entry in cgroup->subsys[] this subsystem should be managing. - -- name: should be initialized to a unique subsystem name. Should be - no longer than MAX_CGROUP_TYPE_NAMELEN. - -- early_init: indicate if the subsystem needs early initialization - at system boot. - -Each cgroup object created by the system has an array of pointers, -indexed by subsystem ID; this pointer is entirely managed by the -subsystem; the generic cgroup code will never touch this pointer. - -3.2 Synchronization -------------------- - -There is a global mutex, cgroup_mutex, used by the cgroup -system. This should be taken by anything that wants to modify a -cgroup. It may also be taken to prevent cgroups from being -modified, but more specific locks may be more appropriate in that -situation. - -See kernel/cgroup.c for more details. - -Subsystems can take/release the cgroup_mutex via the functions -cgroup_lock()/cgroup_unlock(). - -Accessing a task's cgroup pointer may be done in the following ways: -- while holding cgroup_mutex -- while holding the task's alloc_lock (via task_lock()) -- inside an rcu_read_lock() section via rcu_dereference() - -3.3 Subsystem API ------------------ - -Each subsystem should: - -- add an entry in linux/cgroup_subsys.h -- define a cgroup_subsys object called _cgrp_subsys - -Each subsystem may export the following methods. The only mandatory -methods are css_alloc/free. Any others that are null are presumed to -be successful no-ops. - -``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -Called to allocate a subsystem state object for a cgroup. The -subsystem should allocate its subsystem state object for the passed -cgroup, returning a pointer to the new object on success or a -ERR_PTR() value. On success, the subsystem pointer should point to -a structure of type cgroup_subsys_state (typically embedded in a -larger subsystem-specific object), which will be initialized by the -cgroup system. Note that this will be called at initialization to -create the root subsystem state for this subsystem; this case can be -identified by the passed cgroup object having a NULL parent (since -it's the root of the hierarchy) and may be an appropriate place for -initialization code. - -``int css_online(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -Called after @cgrp successfully completed all allocations and made -visible to cgroup_for_each_child/descendant_*() iterators. The -subsystem may choose to fail creation by returning -errno. This -callback can be used to implement reliable state sharing and -propagation along the hierarchy. See the comment on -cgroup_for_each_descendant_pre() for details. - -``void css_offline(struct cgroup *cgrp);`` -(cgroup_mutex held by caller) - -This is the counterpart of css_online() and called iff css_online() -has succeeded on @cgrp. This signifies the beginning of the end of -@cgrp. @cgrp is being removed and the subsystem should start dropping -all references it's holding on @cgrp. When all references are dropped, -cgroup removal will proceed to the next step - css_free(). After this -callback, @cgrp should be considered dead to the subsystem. - -``void css_free(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -The cgroup system is about to free @cgrp; the subsystem should free -its subsystem state object. By the time this method is called, @cgrp -is completely unused; @cgrp->parent is still valid. (Note - can also -be called for a newly-created cgroup if an error occurs after this -subsystem's create() method has been called for the new cgroup). - -``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called prior to moving one or more tasks into a cgroup; if the -subsystem returns an error, this will abort the attach operation. -@tset contains the tasks to be attached and is guaranteed to have at -least one task in it. - -If there are multiple tasks in the taskset, then: - - it's guaranteed that all are from the same thread group - - @tset contains all tasks from the thread group whether or not - they're switching cgroups - - the first task is the leader - -Each @tset entry also contains the task's old cgroup and tasks which -aren't switching cgroup can be skipped easily using the -cgroup_taskset_for_each() iterator. Note that this isn't called on a -fork. If this method returns 0 (success) then this should remain valid -while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. - -``void css_reset(struct cgroup_subsys_state *css)`` -(cgroup_mutex held by caller) - -An optional operation which should restore @css's configuration to the -initial state. This is currently only used on the unified hierarchy -when a subsystem is disabled on a cgroup through -"cgroup.subtree_control" but should remain enabled because other -subsystems depend on it. cgroup core makes such a css invisible by -removing the associated interface files and invokes this callback so -that the hidden subsystem can return to the initial neutral state. -This prevents unexpected resource control from a hidden css and -ensures that the configuration is in the initial state when it is made -visible again later. - -``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called when a task attach operation has failed after can_attach() has succeeded. -A subsystem whose can_attach() has some side-effects should provide this -function, so that the subsystem can implement a rollback. If not, not necessary. -This will be called only about subsystems whose can_attach() operation have -succeeded. The parameters are identical to can_attach(). - -``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called after the task has been attached to the cgroup, to allow any -post-attachment activity that requires memory allocations or blocking. -The parameters are identical to can_attach(). - -``void fork(struct task_struct *task)`` - -Called when a task is forked into a cgroup. - -``void exit(struct task_struct *task)`` - -Called during task exit. - -``void free(struct task_struct *task)`` - -Called when the task_struct is freed. - -``void bind(struct cgroup *root)`` -(cgroup_mutex held by caller) - -Called when a cgroup subsystem is rebound to a different hierarchy -and root cgroup. Currently this will only involve movement between -the default hierarchy (which never has sub-cgroups) and a hierarchy -that is being created/destroyed (and hence has no sub-cgroups). - -4. Extended attribute usage -=========================== - -cgroup filesystem supports certain types of extended attributes in its -directories and files. The current supported types are: - - - Trusted (XATTR_TRUSTED) - - Security (XATTR_SECURITY) - -Both require CAP_SYS_ADMIN capability to set. - -Like in tmpfs, the extended attributes in cgroup filesystem are stored -using kernel memory and it's advised to keep the usage at minimum. This -is the reason why user defined extended attributes are not supported, since -any user can do it and there's no limit in the value size. - -The current known users for this feature are SELinux to limit cgroup usage -in containers and systemd for assorted meta data like main PID in a cgroup -(systemd creates a cgroup per service). - -5. Questions -============ - -:: - - Q: what's up with this '/bin/echo' ? - A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cgroup file system, you won't be - able to tell whether a command succeeded or failed. - - Q: When I attach processes, only the first of the line gets really attached ! - A: We can only return one error code per call to write(). So you should also - put only ONE PID. diff --git a/Documentation/cgroup-v1/cpuacct.rst b/Documentation/cgroup-v1/cpuacct.rst deleted file mode 100644 index d30ed81d2ad7..000000000000 --- a/Documentation/cgroup-v1/cpuacct.rst +++ /dev/null @@ -1,50 +0,0 @@ -========================= -CPU Accounting Controller -========================= - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. - -Accounting groups can be created by first mounting the cgroup filesystem:: - - # mount -t cgroup -ocpuacct none /sys/fs/cgroup - -With the above step, the initial or the parent accounting group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. -/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained -by this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /sys/fs/cgroup:: - - # cd /sys/fs/cgroup - # mkdir g1 - # echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/sys/fs/cgroup/cpuacct.usage also. - -cpuacct.stat file lists a few statistics which further divide the -CPU time obtained by the cgroup into user and system times. Currently -the following statistics are supported: - -user: Time spent by tasks of the cgroup in user mode. -system: Time spent by tasks of the cgroup in kernel mode. - -user and system are in USER_HZ unit. - -cpuacct controller uses percpu_counter interface to collect user and -system times. This has two side effects: - -- It is theoretically possible to see wrong values for user and system times. - This is because percpu_counter_read() on 32bit systems isn't safe - against concurrent writes. -- It is possible to see slightly outdated values for user and system times - due to the batch processing nature of percpu_counter. diff --git a/Documentation/cgroup-v1/cpusets.rst b/Documentation/cgroup-v1/cpusets.rst deleted file mode 100644 index b6a42cdea72b..000000000000 --- a/Documentation/cgroup-v1/cpusets.rst +++ /dev/null @@ -1,866 +0,0 @@ -======= -CPUSETS -======= - -Copyright (C) 2004 BULL SA. - -Written by Simon.Derr@bull.net - -- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -- Modified by Paul Jackson -- Modified by Christoph Lameter -- Modified by Paul Menage -- Modified by Hidetoshi Seto - -.. CONTENTS: - - 1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? - 2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes - 3. Questions - 4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a task's current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroup-v1/cgroups.rst. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that task's cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting task's mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that task's cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that task's cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendants) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that task's cpuset. - - in sched.c migrate_live_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that task's cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. - -The /proc//status file for each task has four added lines, -displaying the task's cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example:: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpuset.cpus: list of CPUs in that cpuset - - cpuset.mems: list of Memory Nodes in that cpuset - - cpuset.memory_migrate flag: if set, move pages to cpusets nodes - - cpuset.cpu_exclusive flag: is cpu placement exclusive? - - cpuset.mem_exclusive flag: is memory placement exclusive? - - cpuset.mem_hardwall flag: is memory allocation hardwalled - - cpuset.memory_pressure: measure of how much paging pressure in cpuset - - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes - - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes - - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset - - cpuset.sched_relax_domain_level: the searching range when migrating tasks - -In addition, only the root cpuset has the following file: - - - cpuset.memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpusets directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parents. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps a -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_mask using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendant, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", -i.e. it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned to them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. - -==> - Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'cpuset.memory_spread_page' and -'cpuset.memory_spread_slab'. - -If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the task's NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the task's NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing task's memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag -PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. The page allocation calls for the page cache -is modified to perform an inline check for this PFA_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'cpuset.memory_spread_slab' turns on the flag -PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current task's mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched/core.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. -Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, including those -marked isolated using the kernel boot time "isolcpus=" argument. However, -the isolated CPUs will not participate in load balancing, and will not -have tasks running on them unless explicitly assigned. - -This default load balancing across all CPUs is not well suited for -the following two situations: - - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwised pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing. - -Therefore in the above two situations, the top cpuset flag -"cpuset.sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendant cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside its cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "cpuset.sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. - -If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -CPUs in "cpuset.isolcpus" were excluded from load balancing by the -isolcpus= kernel boot option, and will never be load balanced regardless -of the value of "cpuset.sched_load_balance" in any cpuset. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. This partition is a set of subsets (represented -as an array of struct cpumask) of CPUs, pairwise disjoint, that cover -all the CPUs that must be load balanced. - -The cpuset code builds a new such partition and passes it to the -scheduler sched domain setup code, to have the sched domains rebuilt -as necessary, whenever: - - - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, - - or CPUs come or go from a cpuset with this flag enabled, - - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs - and with this flag enabled changes, - - or a cpuset with non-empty CPUs and with this flag enabled is removed, - - or a cpu is offlined/onlined. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (struct cpumask) in the -partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. - -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -every time. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searches all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'cpuset.sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - -====== =========================================================== - -1 no request. use system default or follow request of others. - 0 no search. - 1 search siblings (hyperthreads in a core). - 2 search cores in a package. - 3 search cpus in a node [= system wide on non-NUMA system] - 4 search nodes in a chunk of node [on NUMA system] - 5 search system wide [on NUMA system] -====== =========================================================== - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset -is disabled, then 'cpuset.sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used. - -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not depends on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. - then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the task's cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its NUMA placement, -as queried by get_mempolicy(), doesn't change). If a task is moved -from one cpuset to another, then the kernel will adjust the task's -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a task's pid is written to another cpuset's 'tasks' file, then its -allowed CPU placement is changed immediately. If such a task had been -bound to some subset of its cpuset using the sched_setaffinity() call, -the task will be allowed to run on any CPU allowed in its new cpuset, -negating the effect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -and the processor placement is updated immediately. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'cpuset.mems' subsequently changes. -If the cpuset flag file 'cpuset.memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the task's new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'cpuset.memory_migrate' is set true, then if that cpuset's -'cpuset.mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'cpuset.mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the task's prior cpuset, or in the cpuset's -prior 'cpuset.mems' setting, will not be moved. - -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current task's cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /sys/fs/cgroup/cpuset - 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /sys/fs/cgroup/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset:: - - mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -There are ways to query or modify cpusets: - - - via the cpuset file system directly, using the various cd, mkdir, echo, - cat, rmdir commands from the shell, or their equivalent from C. - - via the C library libcpuset. - - via the C library libcgroup. - (http://sourceforge.net/projects/libcg/) - - via the python application cset. - (http://code.google.com/p/cpuset/) - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset - -Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /sys/fs/cgroup/cpuset:: - - # cd /sys/fs/cgroup/cpuset - # mkdir my_cpuset - -Now you want to do something with this cpuset:: - - # cd my_cpuset - -In this directory you can find several files:: - - # ls - cgroup.clone_children cpuset.memory_pressure - cgroup.event_control cpuset.memory_spread_page - cgroup.procs cpuset.memory_spread_slab - cpuset.cpu_exclusive cpuset.mems - cpuset.cpus cpuset.sched_load_balance - cpuset.mem_exclusive cpuset.sched_relax_domain_level - cpuset.mem_hardwall notify_on_release - cpuset.memory_migrate tasks - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags:: - - # /bin/echo 1 > cpuset.cpu_exclusive - -Add some cpus:: - - # /bin/echo 0-7 > cpuset.cpus - -Add some mems:: - - # /bin/echo 0-7 > cpuset.mems - -Now attach your shell to this cpuset:: - - # /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory:: - - # mkdir my_sub_cs - -To remove a cpuset, just use rmdir:: - - # rmdir my_sub_cs - -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command:: - - mount -t cpuset X /sys/fs/cgroup/cpuset - -is equivalent to:: - - mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset - echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories:: - - # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - -To add a CPU to a cpuset, write the new list of CPUs including the -CPU to be added. To add 6 to the above cpuset:: - - # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 - -Similarly to remove a CPU from a cpuset, write the new list of CPUs -without the CPU to be removed. - -To remove all the CPUs:: - - # /bin/echo "" > cpuset.cpus -> clear cpus list - -2.3 Setting flags ------------------ - -The syntax is very simple:: - - # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' - # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' - -2.4 Attaching processes ------------------------ - -:: - - # /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another:: - - # /bin/echo PID1 > tasks - # /bin/echo PID2 > tasks - ... - # /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: - what's up with this '/bin/echo' ? - -A: - bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: - When I attach processes, only the first of the line gets really attached ! - -A: - We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroup-v1/devices.rst b/Documentation/cgroup-v1/devices.rst deleted file mode 100644 index e1886783961e..000000000000 --- a/Documentation/cgroup-v1/devices.rst +++ /dev/null @@ -1,132 +0,0 @@ -=========================== -Device Whitelist Controller -=========================== - -1. Description -============== - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. - -2. User Interface -================= - -An entry is added using devices.allow, and removed using -devices.deny. For instance:: - - echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing:: - - echo a > /sys/fs/cgroup/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing:: - - echo a > /sys/fs/cgroup/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security -=========== - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendant of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. - -4. Hierarchy -============ - -device cgroups maintain hierarchy by making sure a cgroup never has more -access permissions than its parent. Every time an entry is written to -a cgroup's devices.deny file, all its children will have that entry removed -from their whitelist and all the locally set whitelist entries will be -re-evaluated. In case one of the locally set whitelist entries would provide -more access than the cgroup's parent, it'll be removed from the whitelist. - -Example:: - - A - / \ - B - - group behavior exceptions - A allow "b 8:* rwm", "c 116:1 rw" - B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" - -If a device is denied in group A:: - - # echo "c 116:* r" > A/devices.deny - -it'll propagate down and after revalidating B's entries, the whitelist entry -"c 116:2 rwm" will be removed:: - - group whitelist entries denied devices - A all "b 8:* rwm", "c 116:* rw" - B "c 1:3 rwm", "b 3:* rwm" all the rest - -In case parent's exceptions change and local exceptions are not allowed -anymore, they'll be deleted. - -Notice that new whitelist entries will not be propagated:: - - A - / \ - B - - group whitelist entries denied devices - A "c 1:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -when adding ``c *:3 rwm``:: - - # echo "c *:3 rwm" >A/devices.allow - -the result:: - - group whitelist entries denied devices - A "c *:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -but now it'll be possible to add new entries to B:: - - # echo "c 2:3 rwm" >B/devices.allow - # echo "c 50:3 r" >B/devices.allow - -or even:: - - # echo "c *:3 rwm" >B/devices.allow - -Allowing or denying all by writing 'a' to devices.allow or devices.deny will -not be possible once the device cgroups has children. - -4.1 Hierarchy (internal implementation) ---------------------------------------- - -device cgroups is implemented internally using a behavior (ALLOW, DENY) and a -list of exceptions. The internal state is controlled using the same user -interface to preserve compatibility with the previous whitelist-only -implementation. Removal or addition of exceptions that will reduce the access -to devices will be propagated down the hierarchy. -For every propagated exception, the effective rules will be re-evaluated based -on current parent's access rules. diff --git a/Documentation/cgroup-v1/freezer-subsystem.rst b/Documentation/cgroup-v1/freezer-subsystem.rst deleted file mode 100644 index 582d3427de3f..000000000000 --- a/Documentation/cgroup-v1/freezer-subsystem.rst +++ /dev/null @@ -1,127 +0,0 @@ -============== -Cgroup Freezer -============== - -The cgroup freezer is useful to batch job management system which start -and stop sets of tasks in order to schedule the resources of a machine -according to the desires of a system administrator. This sort of program -is often used on HPC clusters to schedule access to the cluster as a -whole. The cgroup freezer uses cgroups to describe the set of tasks to -be started/stopped by the batch job management system. It also provides -a means to start and stop the tasks composing the job. - -The cgroup freezer will also be useful for checkpointing running groups -of tasks. The freezer allows the checkpoint code to obtain a consistent -image of the tasks by attempting to force the tasks in a cgroup into a -quiescent state. Once the tasks are quiescent another task can -walk /proc or invoke a kernel interface to gather information about the -quiesced tasks. Checkpointed tasks can be restarted later should a -recoverable error occur. This also allows the checkpointed tasks to be -migrated between nodes in a cluster by copying the gathered information -to another node and restarting the tasks there. - -Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping -and resuming tasks in userspace. Both of these signals are observable -from within the tasks we wish to freeze. While SIGSTOP cannot be caught, -blocked, or ignored it can be seen by waiting or ptracing parent tasks. -SIGCONT is especially unsuitable since it can be caught by the task. Any -programs designed to watch for SIGSTOP and SIGCONT could be broken by -attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can -demonstrate this problem using nested bash shells:: - - $ echo $$ - 16644 - $ bash - $ echo $$ - 16690 - - From a second, unrelated bash shell: - $ kill -SIGSTOP 16690 - $ kill -SIGCONT 16690 - - - -This happens because bash can observe both signals and choose how it -responds to them. - -Another example of a program which catches and responds to these -signals is gdb. In fact any program designed to use ptrace is likely to -have a problem with this method of stopping and resuming tasks. - -In contrast, the cgroup freezer uses the kernel freezer code to -prevent the freeze/unfreeze cycle from becoming visible to the tasks -being frozen. This allows the bash example above and gdb to run as -expected. - -The cgroup freezer is hierarchical. Freezing a cgroup freezes all -tasks belonging to the cgroup and all its descendant cgroups. Each -cgroup has its own state (self-state) and the state inherited from the -parent (parent-state). Iff both states are THAWED, the cgroup is -THAWED. - -The following cgroupfs files are created by cgroup freezer. - -* freezer.state: Read-write. - - When read, returns the effective state of the cgroup - "THAWED", - "FREEZING" or "FROZEN". This is the combined self and parent-states. - If any is freezing, the cgroup is freezing (FREEZING or FROZEN). - - FREEZING cgroup transitions into FROZEN state when all tasks - belonging to the cgroup and its descendants become frozen. Note that - a cgroup reverts to FREEZING from FROZEN after a new task is added - to the cgroup or one of its descendant cgroups until the new task is - frozen. - - When written, sets the self-state of the cgroup. Two values are - allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, - if not already freezing, enters FREEZING state along with all its - descendant cgroups. - - If THAWED is written, the self-state of the cgroup is changed to - THAWED. Note that the effective state may not change to THAWED if - the parent-state is still freezing. If a cgroup's effective state - becomes THAWED, all its descendants which are freezing because of - the cgroup also leave the freezing state. - -* freezer.self_freezing: Read only. - - Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. - This value is 1 iff the last write to freezer.state was "FROZEN". - -* freezer.parent_freezing: Read only. - - Shows the parent-state. 0 if none of the cgroup's ancestors is - frozen; otherwise, 1. - -The root cgroup is non-freezable and the above interface files don't -exist. - -* Examples of usage:: - - # mkdir /sys/fs/cgroup/freezer - # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer - # mkdir /sys/fs/cgroup/freezer/0 - # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks - -to get status of the freezer subsystem:: - - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -to freeze all tasks in the container:: - - # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - FREEZING - # cat /sys/fs/cgroup/freezer/0/freezer.state - FROZEN - -to unfreeze all tasks in the container:: - - # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -This is the basic mechanism which should do the right thing for user space task -in a simple scenario. diff --git a/Documentation/cgroup-v1/hugetlb.rst b/Documentation/cgroup-v1/hugetlb.rst deleted file mode 100644 index a3902aa253a9..000000000000 --- a/Documentation/cgroup-v1/hugetlb.rst +++ /dev/null @@ -1,50 +0,0 @@ -================== -HugeTLB Controller -================== - -The HugeTLB controller allows to limit the HugeTLB usage per control group and -enforces the controller limit during page fault. Since HugeTLB doesn't -support page reclaim, enforcing the limit at page fault time implies that, -the application will get SIGBUS signal if it tries to access HugeTLB pages -beyond its limit. This requires the application to know beforehand how much -HugeTLB pages it would require for its use. - -HugeTLB controller can be created by first mounting the cgroup filesystem. - -# mount -t cgroup -o hugetlb none /sys/fs/cgroup - -With the above step, the initial or the parent HugeTLB group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. - -New groups can be created under the parent group /sys/fs/cgroup:: - - # cd /sys/fs/cgroup - # mkdir g1 - # echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. - -Brief summary of control files:: - - hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage - hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded - hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb - hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit - -For a system supporting three hugepage sizes (64k, 32M and 1G), the control -files include:: - - hugetlb.1GB.limit_in_bytes - hugetlb.1GB.max_usage_in_bytes - hugetlb.1GB.usage_in_bytes - hugetlb.1GB.failcnt - hugetlb.64KB.limit_in_bytes - hugetlb.64KB.max_usage_in_bytes - hugetlb.64KB.usage_in_bytes - hugetlb.64KB.failcnt - hugetlb.32MB.limit_in_bytes - hugetlb.32MB.max_usage_in_bytes - hugetlb.32MB.usage_in_bytes - hugetlb.32MB.failcnt diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst deleted file mode 100644 index fe76d42edc11..000000000000 --- a/Documentation/cgroup-v1/index.rst +++ /dev/null @@ -1,30 +0,0 @@ -:orphan: - -======================== -Control Groups version 1 -======================== - -.. toctree:: - :maxdepth: 1 - - cgroups - - blkio-controller - cpuacct - cpusets - devices - freezer-subsystem - hugetlb - memcg_test - memory - net_cls - net_prio - pids - rdma - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/cgroup-v1/memcg_test.rst b/Documentation/cgroup-v1/memcg_test.rst deleted file mode 100644 index 91bd18c6a514..000000000000 --- a/Documentation/cgroup-v1/memcg_test.rst +++ /dev/null @@ -1,355 +0,0 @@ -===================================================== -Memory Resource Controller(Memcg) Implementation Memo -===================================================== - -Last Updated: 2010/2 - -Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/cgroup-v1/memory.rst) - -0. How to record usage ? -======================== - - 2 objects are used. - - page_cgroup ....an object per page. - - Allocated at boot or memory hotplug. Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge -========= - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_try_charge() - -2. Uncharge -=========== - - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge() - Called when a page's refcount goes down to 0. - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - -3. charge-commit-cancel -======================= - - Memcg pages are charged in two steps: - - - mem_cgroup_try_charge() - - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the page is associated with the memcg. - - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous -============ - - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - -5. Page Cache -============= - - Page Cache is charged at - - add_to_page_cache_locked(). - - The logic is very clear. (About migration, see below) - - Note: - __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache -=========================== - - The best way to understand shmem's page state transition is to read - mm/shmem.c. - - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - -7. Page Migration -================= - - mem_cgroup_migrate() - -8. LRU -====== - Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global pgdat->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under pgdat->lru_lock. - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. -================= - - Tests for racy cases. - -9.1 Small limit to memcg. -------------------------- - - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - -9.2 Shmem ---------- - - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - -9.3 Migration -------------- - - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration:: - - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset.:: - - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - -9.4 Memory hotplug ------------------- - - memory hotplug test is one of good test. - - to offline memory, do following:: - - # echo offline > /sys/devices/system/memory/memoryXXX/state - - (XXX is the place of memory) - - This is an easy way to test page migration, too. - -9.5 mkdir/rmdir ---------------- - - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following:: - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running:: - - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - -9.6 Mount with other subsystems -------------------------------- - - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example:: - - # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. - -9.7 swapoff ------------ - - Besides management of swap is one of complicated parts of memcg, - call path of swap-in at swapoff is not same as usual swap-in path.. - It's worth to be tested explicitly. - - For example, test like following is good: - - (Shell-A):: - - # mount -t cgroup none /cgroup -o memory - # mkdir /cgroup/test - # echo 40M > /cgroup/test/memory.limit_in_bytes - # echo 0 > /cgroup/test/tasks - - Run malloc(100M) program under this. You'll see 60M of swaps. - - (Shell-B):: - - # move all tasks in /cgroup/test to /cgroup - # /sbin/swapoff -a - # rmdir /cgroup/test - # kill malloc task. - - Of course, tmpfs v.s. swapoff test should be tested, too. - -9.8 OOM-Killer --------------- - - Out-of-memory caused by memcg's limit will kill tasks under - the memcg. When hierarchy is used, a task under hierarchy - will be killed by the kernel. - - In this case, panic_on_oom shouldn't be invoked and tasks - in other groups shouldn't be killed. - - It's not difficult to cause OOM under memcg as following. - - Case A) when you can swapoff:: - - #swapoff -a - #echo 50M > /memory.limit_in_bytes - - run 51M of malloc - - Case B) when you use mem+swap limitation:: - - #echo 50M > memory.limit_in_bytes - #echo 50M > memory.memsw.limit_in_bytes - - run 51M of malloc - -9.9 Move charges at task migration ----------------------------------- - - Charges associated with a task can be moved along with task migration. - - (Shell-A):: - - #mkdir /cgroup/A - #echo $$ >/cgroup/A/tasks - - run some programs which uses some amount of memory in /cgroup/A. - - (Shell-B):: - - #mkdir /cgroup/B - #echo 1 >/cgroup/B/memory.move_charge_at_immigrate - #echo "pid of the program running in group A" >/cgroup/B/tasks - - You can see charges have been moved by reading ``*.usage_in_bytes`` or - memory.stat of both A and B. - - See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should - be written to move_charge_at_immigrate. - -9.10 Memory thresholds ----------------------- - - Memory controller implements memory thresholds using cgroups notification - API. You can use tools/cgroup/cgroup_event_listener.c to test it. - - (Shell-A) Create cgroup and run event listener:: - - # mkdir /cgroup/A - # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M - - (Shell-B) Add task to cgroup and try to allocate and free memory:: - - # echo $$ >/cgroup/A/tasks - # a="$(dd if=/dev/zero bs=1M count=10)" - # a= - - You will see message from cgroup_event_listener every time you cross - the thresholds. - - Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. - - It's good idea to test root cgroup as well. diff --git a/Documentation/cgroup-v1/memory.rst b/Documentation/cgroup-v1/memory.rst deleted file mode 100644 index 41bdc038dad9..000000000000 --- a/Documentation/cgroup-v1/memory.rst +++ /dev/null @@ -1,1003 +0,0 @@ -========================== -Memory Resource Controller -========================== - -NOTE: - This document is hopelessly outdated and it asks for a complete - rewrite. It still contains a useful information so we are keeping it - here but make sure to check the current code if you need a deeper - understanding. - -NOTE: - The Memory Resource Controller has generically been referred to as the - memory controller in this document. Do not confuse memory controller - used here with the memory controller that is used in hardware. - -(For editors) In this document: - When we mention a cgroup (cgroupfs's directory) with memory controller, - we call it "memory cgroup". When you see git-log and source code, you'll - see patch's title and function names tend to use "memcg". - In this document, we avoid using it. - -Benefits and Purpose of the memory controller -============================================= - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory-hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with a limited amount of memory; this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases; find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -Current Status: linux-2.6.34-mmotm(development version of 2010/April) - -Features: - - - accounting anonymous pages, file caches, swap caches usage and limiting them. - - pages are linked to per-memcg LRU exclusively, and there is no global LRU. - - optionally, memory+swap usage can be accounted and limited. - - hierarchical accounting - - soft limit - - moving (recharging) account at moving a task is selectable. - - usage threshold notifier - - memory pressure notifier - - oom-killer disable knob and oom-notifier - - Root cgroup has no limit controls. - - Kernel memory support is a work in progress, and the current version provides - basically functionality. (See Section 2.7) - -Brief summary of control files. - -==================================== ========================================== - tasks attach a task(thread) and show list of - threads - cgroup.procs show list of processes - cgroup.event_control an interface for event_fd() - memory.usage_in_bytes show current usage for memory - (See 5.5 for details) - memory.memsw.usage_in_bytes show current usage for memory+Swap - (See 5.5 for details) - memory.limit_in_bytes set/show limit of memory usage - memory.memsw.limit_in_bytes set/show limit of memory+Swap usage - memory.failcnt show the number of memory usage hits limits - memory.memsw.failcnt show the number of memory+Swap hits limits - memory.max_usage_in_bytes show max memory usage recorded - memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded - memory.soft_limit_in_bytes set/show soft limit of memory usage - memory.stat show various statistics - memory.use_hierarchy set/show hierarchical account enabled - memory.force_empty trigger forced page reclaim - memory.pressure_level set memory pressure notifications - memory.swappiness set/show swappiness parameter of vmscan - (See sysctl's vm.swappiness) - memory.move_charge_at_immigrate set/show controls of moving charges - memory.oom_control set/show oom controls. - memory.numa_stat show the number of memory usage per numa - node - - memory.kmem.limit_in_bytes set/show hard limit for kernel memory - memory.kmem.usage_in_bytes show current kernel memory allocation - memory.kmem.failcnt show the number of kernel memory usage - hits limits - memory.kmem.max_usage_in_bytes show max kernel memory usage recorded - - memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory - memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation - memory.kmem.tcp.failcnt show the number of tcp buf memory usage - hits limits - memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded -==================================== ========================================== - -1. History -========== - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control -================= - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design ------------ - -The core of the design is a counter called the page_counter. The -page_counter tracks the current memory usage and limit of the group of -processes associated with the controller. Each cgroup has a memory controller -specific data structure (mem_cgroup) associated with it. - -2.2. Accounting ---------------- - -:: - - +--------------------+ - | mem_cgroup | - | (page_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge_common() is invoked to -set up the necessary data structures and check if the cgroup that is being -charged is over its limit. If it is, then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -updated. page_cgroup has its own LRU on cgroup. -(*) page_cgroup structure is allocated at boot/memory-hotplug time. - -2.2.1 Accounting details ------------------------- - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -Some pages which are never reclaimable and will not be on the LRU -are not accounted. We just account pages under usual VM management. - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -An RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. Even if RSS pages are fully -unmapped (by kswapd), they may exist as SwapCache in the system until they -are really freed. Such SwapCaches are also accounted. -A swapped-in page is not accounted until it's mapped. - -Note: The kernel does swapin-readahead and reads multiple swaps at once. -This means swapped-in pages may contain pages for other tasks than a task -causing page fault. So, we avoid accounting at swap-in I/O. - -At page migration, accounting information is kept. - -Note: we just account pages-on-LRU because our purpose is to control amount -of used pages; not-on-LRU pages tend to be out-of-control from VM view. - -2.3 Shared Page Accounting --------------------------- - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -But see section 8.2: when moving a task to another cgroup, its pages may -be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. - -Exception: If CONFIG_MEMCG_SWAP is not used. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - -2.4 Swap Extension (CONFIG_MEMCG_SWAP) --------------------------------------- - -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -memsw means memory+swap. Usage of memory+swap is limited by -memsw.limit_in_bytes. - -Example: Assume a system with 4G of swap. A task which allocates 6G of memory -(by mistake) under 2G memory limitation will use all swap. -In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. -By using the memsw limit, you can avoid system OOM which can be caused by swap -shortage. - -**why 'memory+swap' rather than swap** - -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -memory+swap. In other words, when we want to limit the usage of swap without -affecting global LRU, memory+swap limit is better than just limiting swap from -an OS point of view. - -**What happens when a cgroup hits memory.memsw.limit_in_bytes** - -When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out -in this cgroup. Then, swap-out will not be done by cgroup routine and file -caches are dropped. But as mentioned above, global LRU can do swapout memory -from it for sanity of the system's memory management state. You can't forbid -it by cgroup. - -2.5 Reclaim ------------ - -Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: - Reclaim does not work for the root cgroup, since we cannot set any - limits on the root cgroup. - -Note2: - When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. -(See oom_control section) - -2.6 Locking ------------ - - lock_page_cgroup()/unlock_page_cgroup() should not be called under - the i_pages lock. - - Other lock order is following: - - PG_locked. - mm->page_table_lock - pgdat->lru_lock - lock_page_cgroup. - - In many cases, just lock_page_cgroup() is called. - - per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - pgdat->lru_lock, it has no lock of its own. - -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) ------------------------------------------------ - -With the Kernel memory extension, the Memory Controller is able to limit -the amount of kernel memory used by the system. Kernel memory is fundamentally -different than user memory, since it can't be swapped out, which makes it -possible to DoS the system by consuming too much of this precious resource. - -Kernel memory accounting is enabled for all memory cgroups by default. But -it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel -at boot time. In this case, kernel memory will not be accounted at all. - -Kernel memory limits are not imposed for the root cgroup. Usage for the root -cgroup may or may not be accounted. The memory used is accumulated into -memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. -(currently only for tcp). - -The main "kmem" counter is fed into the main counter, so kmem charges will -also be visible from the user counter. - -Currently no soft limit is implemented for kernel memory. It is future work -to trigger slab reclaim when those limits are reached. - -2.7.1 Current Kernel Memory resources accounted ------------------------------------------------ - -stack pages: - every process consumes some stack pages. By accounting into - kernel memory, we prevent new processes from being created when the kernel - memory usage is too high. - -slab pages: - pages allocated by the SLAB or SLUB allocator are tracked. A copy - of each kmem_cache is created every time the cache is touched by the first time - from inside the memcg. The creation is done lazily, so some objects can still be - skipped while the cache is being created. All objects in a slab page should - belong to the same memcg. This only fails to hold when a task is migrated to a - different memcg during the page allocation by the cache. - -sockets memory pressure: - some sockets protocols have memory pressure - thresholds. The Memory Controller allows them to be controlled individually - per cgroup, instead of globally. - -tcp memory pressure: - sockets memory pressure for the tcp protocol. - -2.7.2 Common use cases ----------------------- - -Because the "kmem" counter is fed to the main user counter, kernel memory can -never be limited completely independently of user memory. Say "U" is the user -limit, and "K" the kernel limit. There are three possible ways limits can be -set: - -U != 0, K = unlimited: - This is the standard memcg limitation mechanism already present before kmem - accounting. Kernel memory is completely ignored. - -U != 0, K < U: - Kernel memory is a subset of the user memory. This setup is useful in - deployments where the total amount of memory per-cgroup is overcommited. - Overcommiting kernel memory limits is definitely not recommended, since the - box can still run out of non-reclaimable memory. - In this case, the admin could set up K so that the sum of all groups is - never greater than the total memory, and freely set U at the cost of his - QoS. - -WARNING: - In the current implementation, memory reclaim will NOT be - triggered for a cgroup when it hits K while staying below U, which makes - this setup impractical. - -U != 0, K >= U: - Since kmem charges will also be fed to the user counter and reclaim will be - triggered for the cgroup for both kinds of memory. This setup gives the - admin a unified view of memory, and it is also useful for people who just - want to track kernel memory usage. - -3. User Interface -================= - -3.0. Configuration ------------------- - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_MEMCG -c. Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) - -3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) -------------------------------------------------------------------- - -:: - - # mount -t tmpfs none /sys/fs/cgroup - # mkdir /sys/fs/cgroup/memory - # mount -t cgroup none /sys/fs/cgroup/memory -o memory - -3.2. Make the new group and move bash into it:: - - # mkdir /sys/fs/cgroup/memory/0 - # echo $$ > /sys/fs/cgroup/memory/0/tasks - -Since now we're in the 0 cgroup, we can alter the memory limit:: - - # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes - -NOTE: - We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, - mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, - Gibibytes.) - -NOTE: - We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. - -NOTE: - We cannot set limits on the root cgroup any more. - -:: - - # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes - 4194304 - -We can check the usage:: - - # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes - 1216512 - -A successful write to this file does not guarantee a successful setting of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel:: - - # echo 1 > memory.limit_in_bytes - # cat memory.limit_in_bytes - 4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing -========== - -For testing features and implementation, see memcg_test.txt. - -Performance test is also important. To see pure memory controller's overhead, -testing on tmpfs will give you good numbers of small overheads. -Example: do kernel make on tmpfs. - -Page-fault scalability is also important. At measuring parallel -page fault test, multi-process test may be better than multi-thread -test because it has noise of shared objects/status. - -But the above two are testing extreme situations. -Trying usual test under memory controller is always helpful. - -4.1 Troubleshooting -------------------- - -Sometimes a user might find that the application under a cgroup is -terminated by the OOM killer. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and -seeing what happens will be helpful. - -4.2 Task migration ------------------- - -When a task migrates from one cgroup to another, its charge is not -carried forward by default. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -You can move charges of a task along with task migration. -See 8. "Move charges at task migration" - -4.3 Removing a cgroup ---------------------- - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. (because we charge against pages, not -against tasks.) - -We move the stats to root (if use_hierarchy==0) or parent (if -use_hierarchy==1), and no change on the charge except uncharging -from the child. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - -About use_hierarchy, see Section 6. - -5. Misc. interfaces -=================== - -5.1 force_empty ---------------- - memory.force_empty interface is provided to make cgroup's memory usage empty. - When writing anything to this:: - - # echo 0 > memory.force_empty - - the cgroup will be reclaimed and as many pages reclaimed as possible. - - The typical use case for this interface is before calling rmdir(). - Though rmdir() offlines memcg, but the memcg may still stay there due to - charged file caches. Some out-of-use page caches may keep charged until - memory pressure happens. If you want to avoid that, force_empty will be useful. - - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - - About use_hierarchy, see Section 6. - -5.2 stat file -------------- - -memory.stat file includes following statistics - -per-memory cgroup local status -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -=============== =============================================================== -cache # of bytes of page cache memory. -rss # of bytes of anonymous and swap cache memory (includes - transparent hugepages). -rss_huge # of bytes of anonymous transparent hugepages. -mapped_file # of bytes of mapped file (includes tmpfs/shmem) -pgpgin # of charging events to the memory cgroup. The charging - event happens each time a page is accounted as either mapped - anon page(RSS) or cache page(Page Cache) to the cgroup. -pgpgout # of uncharging events to the memory cgroup. The uncharging - event happens each time a page is unaccounted from the cgroup. -swap # of bytes of swap usage -dirty # of bytes that are waiting to get written back to the disk. -writeback # of bytes of file/anon cache that are queued for syncing to - disk. -inactive_anon # of bytes of anonymous and swap cache memory on inactive - LRU list. -active_anon # of bytes of anonymous and swap cache memory on active - LRU list. -inactive_file # of bytes of file-backed memory on inactive LRU list. -active_file # of bytes of file-backed memory on active LRU list. -unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). -=============== =============================================================== - -status considering hierarchy (see memory.use_hierarchy settings) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= =================================================== -hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy - under which the memory cgroup is -hierarchical_memsw_limit # of bytes of memory+swap limit with regard to - hierarchy under which memory cgroup is. - -total_ # hierarchical version of , which in - addition to the cgroup's own value includes the - sum of all hierarchical children's values of - , i.e. total_cache -========================= =================================================== - -The following additional stats are dependent on CONFIG_DEBUG_VM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= ======================================== -recent_rotated_anon VM internal parameter. (see mm/vmscan.c) -recent_rotated_file VM internal parameter. (see mm/vmscan.c) -recent_scanned_anon VM internal parameter. (see mm/vmscan.c) -recent_scanned_file VM internal parameter. (see mm/vmscan.c) -========================= ======================================== - -Memo: - recent_rotated means recent frequency of LRU rotation. - recent_scanned means recent # of scans to LRU. - showing for better debug please see the code for meanings. - -Note: - Only anonymous and swap cache memory is listed as part of 'rss' stat. - This should not be confused with the true 'resident set size' or the - amount of physical memory used by the cgroup. - - 'rss + mapped_file" will give you resident set size of cgroup. - - (Note: file and shmem may be shared among other cgroups. In that case, - mapped_file is accounted only when the memory cgroup is owner of page - cache.) - -5.3 swappiness --------------- - -Overrides /proc/sys/vm/swappiness for the particular group. The tunable -in the root cgroup corresponds to the global swappiness setting. - -Please note that unlike during the global reclaim, limit reclaim -enforces that 0 swappiness really prevents from any swapping even if -there is a swap storage available. This might lead to memcg OOM killer -if there are no file pages to reclaim. - -5.4 failcnt ------------ - -A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. -This failcnt(== failure count) shows the number of times that a usage counter -hit its limit. When a memory cgroup hits a limit, failcnt increases and -memory under it will be reclaimed. - -You can reset failcnt by writing 0 to failcnt file:: - - # echo 0 > .../memory.failcnt - -5.5 usage_in_bytes ------------------- - -For efficiency, as other kernel components, memory cgroup uses some optimization -to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the -method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz -value for efficient access. (Of course, when necessary, it's synchronized.) -If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) -value in memory.stat(see 5.2). - -5.6 numa_stat -------------- - -This is similar to numa_maps but operates on a per-memcg basis. This is -useful for providing visibility into the numa locality information within -an memcg since the pages are allowed to be allocated from any physical -node. One of the use cases is evaluating application performance by -combining this information with the application's CPU allocation. - -Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" -per-node page counts including "hierarchical_" which sums up all -hierarchical children's values in addition to the memcg's own value. - -The output format of memory.numa_stat is:: - - total= N0= N1= ... - file= N0= N1= ... - anon= N0= N1= ... - unevictable= N0= N1= ... - hierarchical_= N0= N1= ... - -The "total" count is sum of file + anon + unevictable. - -6. Hierarchy support -==================== - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider for example, the following cgroup filesystem -hierarchy:: - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled. If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim ------------------------------------------------- - -A memory cgroup by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup:: - - # echo 1 > memory.use_hierarchy - -The feature can be disabled by:: - - # echo 0 > memory.use_hierarchy - -NOTE1: - Enabling/disabling will fail if either the cgroup already has other - cgroups created below it, or if the parent cgroup has use_hierarchy - enabled. - -NOTE2: - When panic_on_oom is set to "2", the whole system will panic in - case of an OOM event in any cgroup. - -7. Soft limits -============== - -Soft limits allow for greater sharing of memory. The idea behind soft limits -is to allow control groups to use as much of the memory as needed, provided - -a. There is no memory contention -b. They do not exceed their hard limit - -When the system detects memory contention or low memory, control groups -are pushed back to their soft limits. If the soft limit of each control -group is very high, they are pushed back as much as possible to make -sure that one control group does not starve the others of memory. - -Please note that soft limits is a best-effort feature; it comes with -no guarantees, but it does its best to make sure that when memory is -heavily contended for, memory is allocated based on the soft limit -hints/setup. Currently soft limit based reclaim is set up such that -it gets invoked from balance_pgdat (kswapd). - -7.1 Interface -------------- - -Soft limits can be setup by using the following commands (in this example we -assume a soft limit of 256 MiB):: - - # echo 256M > memory.soft_limit_in_bytes - -If we want to change this to 1G, we can at any time use:: - - # echo 1G > memory.soft_limit_in_bytes - -NOTE1: - Soft limits take effect over a long period of time, since they involve - reclaiming memory for balancing between memory cgroups -NOTE2: - It is recommended to set the soft limit always below the hard limit, - otherwise the hard limit will take precedence. - -8. Move charges at task migration -================================= - -Users can move charges associated with a task along with task migration, that -is, uncharge task's pages from the old cgroup and charge them to the new cgroup. -This feature is not supported in !CONFIG_MMU environments because of lack of -page tables. - -8.1 Interface -------------- - -This feature is disabled by default. It can be enabled (and disabled again) by -writing to memory.move_charge_at_immigrate of the destination cgroup. - -If you want to enable it:: - - # echo (some positive value) > memory.move_charge_at_immigrate - -Note: - Each bits of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See 8.2 for details. -Note: - Charges are moved only when you move mm->owner, in other words, - a leader of a thread group. -Note: - If we cannot find enough space for the task in the destination cgroup, we - try to make space by reclaiming memory. Task migration may fail if we - cannot make enough space. -Note: - It can take several seconds if you move charges much. - -And if you want disable it again:: - - # echo 0 > memory.move_charge_at_immigrate - -8.2 Type of charges which can be moved --------------------------------------- - -Each bit in move_charge_at_immigrate has its own meaning about what type of -charges should be moved. But in any case, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current -(old) memory cgroup. - -+---+--------------------------------------------------------------------------+ -|bit| what type of charges would be moved ? | -+===+==========================================================================+ -| 0 | A charge of an anonymous page (or swap of it) used by the target task. | -| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | -+---+--------------------------------------------------------------------------+ -| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | -| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | -| | anonymous pages, file pages (and swaps) in the range mmapped by the task | -| | will be moved even if the task hasn't done page fault, i.e. they might | -| | not be the task's "RSS", but other task's "RSS" that maps the same file. | -| | And mapcount of the page is ignored (the page can be moved even if | -| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | -| | enable move of swap charges. | -+---+--------------------------------------------------------------------------+ - -8.3 TODO --------- - -- All of moving charge operations are done under cgroup_mutex. It's not good - behavior to hold the mutex too long, so we may need some trick. - -9. Memory thresholds -==================== - -Memory cgroup implements memory thresholds using the cgroups notification -API (see cgroups.txt). It allows to register multiple memory and memsw -thresholds and gets notifications when it crosses. - -To register a threshold, an application must: - -- create an eventfd using eventfd(2); -- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; -- write string like " " to - cgroup.event_control. - -Application will be notified through eventfd when memory usage crosses -threshold in any direction. - -It's applicable for root and non-root cgroup. - -10. OOM Control -=============== - -memory.oom_control file is for OOM notification and other controls. - -Memory cgroup implements OOM notifier using the cgroup notification -API (See cgroups.txt). It allows to register multiple OOM notification -delivery and gets notification when OOM happens. - -To register a notifier, an application must: - - - create an eventfd using eventfd(2) - - open memory.oom_control file - - write string like " " to - cgroup.event_control - -The application will be notified through eventfd when OOM happens. -OOM notification doesn't work for the root cgroup. - -You can disable the OOM-killer by writing "1" to memory.oom_control file, as: - - #echo 1 > memory.oom_control - -If OOM-killer is disabled, tasks under cgroup will hang/sleep -in memory cgroup's OOM-waitqueue when they request accountable memory. - -For running them, you have to relax the memory cgroup's OOM status by - - * enlarge limit or reduce usage. - -To reduce usage, - - * kill some tasks. - * move some tasks to other group with account migration. - * remove some files (on tmpfs?) - -Then, stopped tasks will work again. - -At reading, current status of OOM is shown. - - - oom_kill_disable 0 or 1 - (if 1, oom-killer is disabled) - - under_oom 0 or 1 - (if 1, the memory cgroup is under OOM, tasks may be stopped.) - -11. Memory Pressure -=================== - -The pressure level notifications can be used to monitor the memory -allocation cost; based on the pressure, applications can implement -different strategies of managing their memory resources. The pressure -levels are defined as following: - -The "low" level means that the system is reclaiming memory for new -allocations. Monitoring this reclaiming activity might be useful for -maintaining cache level. Upon notification, the program (typically -"Activity Manager") might analyze vmstat and act in advance (i.e. -prematurely shutdown unimportant services). - -The "medium" level means that the system is experiencing medium memory -pressure, the system might be making swap, paging out active file caches, -etc. Upon this event applications may decide to further analyze -vmstat/zoneinfo/memcg or internal memory usage statistics and free any -resources that can be easily reconstructed or re-read from a disk. - -The "critical" level means that the system is actively thrashing, it is -about to out of memory (OOM) or even the in-kernel OOM killer is on its -way to trigger. Applications should do whatever they can to help the -system. It might be too late to consult with vmstat or any other -statistics, so it's advisable to take an immediate action. - -By default, events are propagated upward until the event is handled, i.e. the -events are not pass-through. For example, you have three cgroups: A->B->C. Now -you set up an event listener on cgroups A, B and C, and suppose group C -experiences some pressure. In this situation, only group C will receive the -notification, i.e. groups A and B will not receive it. This is done to avoid -excessive "broadcasting" of messages, which disturbs the system and which is -especially bad if we are low on memory or thrashing. Group B, will receive -notification only if there are no event listers for group C. - -There are three optional modes that specify different propagation behavior: - - - "default": this is the default behavior specified above. This mode is the - same as omitting the optional mode parameter, preserved by backwards - compatibility. - - - "hierarchy": events always propagate up to the root, similar to the default - behavior, except that propagation continues regardless of whether there are - event listeners at each level, with the "hierarchy" mode. In the above - example, groups A, B, and C will receive notification of memory pressure. - - - "local": events are pass-through, i.e. they only receive notifications when - memory pressure is experienced in the memcg for which the notification is - registered. In the above example, group C will receive notification if - registered for "local" notification and the group experiences memory - pressure. However, group B will never receive notification, regardless if - there is an event listener for group C or not, if group B is registered for - local notification. - -The level and event notification mode ("hierarchy" or "local", if necessary) are -specified by a comma-delimited string, i.e. "low,hierarchy" specifies -hierarchical, pass-through, notification for all ancestor memcgs. Notification -that is the default, non pass-through behavior, does not specify a mode. -"medium,local" specifies pass-through notification for the medium level. - -The file memory.pressure_level is only used to setup an eventfd. To -register a notification, an application must: - -- create an eventfd using eventfd(2); -- open memory.pressure_level; -- write string as " " - to cgroup.event_control. - -Application will be notified through eventfd when memory pressure is at -the specific level (or higher). Read/write operations to -memory.pressure_level are no implemented. - -Test: - - Here is a small script example that makes a new cgroup, sets up a - memory limit, sets up a notification in the cgroup and then makes child - cgroup experience a critical pressure:: - - # cd /sys/fs/cgroup/memory/ - # mkdir foo - # cd foo - # cgroup_event_listener memory.pressure_level low,hierarchy & - # echo 8000000 > memory.limit_in_bytes - # echo 8000000 > memory.memsw.limit_in_bytes - # echo $$ > tasks - # dd if=/dev/zero | read x - - (Expect a bunch of notifications, and eventually, the oom-killer will - trigger.) - -12. TODO -======== - -1. Make per-cgroup scanner reclaim not-shared pages first -2. Teach controller to account for shared-pages -3. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary -======= - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References -========== - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroup-v1/net_cls.rst b/Documentation/cgroup-v1/net_cls.rst deleted file mode 100644 index a2cf272af7a0..000000000000 --- a/Documentation/cgroup-v1/net_cls.rst +++ /dev/null @@ -1,44 +0,0 @@ -========================= -Network classifier cgroup -========================= - -The Network classifier cgroup provides an interface to -tag network packets with a class identifier (classid). - -The Traffic Controller (tc) can be used to assign -different priorities to packets from different cgroups. -Also, Netfilter (iptables) can use this tag to perform -actions on such packets. - -Creating a net_cls cgroups instance creates a net_cls.classid file. -This net_cls.classid value is initialized to 0. - -You can write hexadecimal values to net_cls.classid; the format for these -values is 0xAAAABBBB; AAAA is the major handle number and BBBB -is the minor handle number. -Reading net_cls.classid yields a decimal result. - -Example:: - - mkdir /sys/fs/cgroup/net_cls - mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls - mkdir /sys/fs/cgroup/net_cls/0 - echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid - -- setting a 10:1 handle:: - - cat /sys/fs/cgroup/net_cls/0/net_cls.classid - 1048577 - -- configuring tc:: - - tc qdisc add dev eth0 root handle 10: htb - tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit - -- creating traffic class 10:1:: - - tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup - -configuring iptables, basic example:: - - iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroup-v1/net_prio.rst b/Documentation/cgroup-v1/net_prio.rst deleted file mode 100644 index b40905871c64..000000000000 --- a/Documentation/cgroup-v1/net_prio.rst +++ /dev/null @@ -1,57 +0,0 @@ -======================= -Network priority cgroup -======================= - -The Network priority cgroup provides an interface to allow an administrator to -dynamically set the priority of network traffic generated by various -applications - -Nominally, an application would set the priority of its traffic via the -SO_PRIORITY socket option. This however, is not always possible because: - -1) The application may not have been coded to set this value -2) The priority of application traffic is often a site-specific administrative - decision rather than an application defined one. - -This cgroup allows an administrator to assign a process to a group which defines -the priority of egress traffic on a given interface. Network priority groups can -be created by first mounting the cgroup filesystem:: - - # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio - -With the above step, the initial group acting as the parent accounting group -becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in -the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. - -Each net_prio cgroup contains two files that are subsystem specific - -net_prio.prioidx - This file is read-only, and is simply informative. It contains a unique - integer value that the kernel uses as an internal representation of this - cgroup. - -net_prio.ifpriomap - This file contains a map of the priorities assigned to traffic originating - from processes in this group and egressing the system on various interfaces. - It contains a list of tuples in the form . Contents of this - file can be modified by echoing a string into the file using the same tuple - format. For example:: - - echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap - -This command would force any traffic originating from processes belonging to the -iscsi net_prio cgroup and egressing on interface eth0 to have the priority of -said traffic set to the value 5. The parent accounting group also has a -writeable 'net_prio.ifpriomap' file that can be used to set a system default -priority. - -Priorities are set immediately prior to queueing a frame to the device -queueing discipline (qdisc) so priorities will be assigned prior to the hardware -queue selection being made. - -One usage for the net_prio cgroup is with mqprio qdisc allowing application -traffic to be steered to hardware/driver based traffic classes. These mappings -can then be managed by administrators or other networking protocols such as -DCBX. - -A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/cgroup-v1/pids.rst b/Documentation/cgroup-v1/pids.rst deleted file mode 100644 index 6acebd9e72c8..000000000000 --- a/Documentation/cgroup-v1/pids.rst +++ /dev/null @@ -1,92 +0,0 @@ -========================= -Process Number Controller -========================= - -Abstract --------- - -The process number controller is used to allow a cgroup hierarchy to stop any -new tasks from being fork()'d or clone()'d after a certain limit is reached. - -Since it is trivial to hit the task limit without hitting any kmemcg limits in -place, PIDs are a fundamental resource. As such, PID exhaustion must be -preventable in the scope of a cgroup hierarchy by allowing resource limiting of -the number of tasks in a cgroup. - -Usage ------ - -In order to use the `pids` controller, set the maximum number of tasks in -pids.max (this is not available in the root cgroup for obvious reasons). The -number of processes currently in the cgroup is given by pids.current. - -Organisational operations are not blocked by cgroup policies, so it is possible -to have pids.current > pids.max. This can be done by either setting the limit to -be smaller than pids.current, or attaching enough processes to the cgroup such -that pids.current > pids.max. However, it is not possible to violate a cgroup -policy through fork() or clone(). fork() and clone() will return -EAGAIN if the -creation of a new process would cause a cgroup policy to be violated. - -To set a cgroup to have no limit, set pids.max to "max". This is the default for -all new cgroups (N.B. that PID limits are hierarchical, so the most stringent -limit in the hierarchy is followed). - -pids.current tracks all child cgroup hierarchies, so parent/pids.current is a -superset of parent/child/pids.current. - -The pids.events file contains event counters: - - - max: Number of times fork failed because limit was hit. - -Example -------- - -First, we mount the pids controller:: - - # mkdir -p /sys/fs/cgroup/pids - # mount -t cgroup -o pids none /sys/fs/cgroup/pids - -Then we create a hierarchy, set limits and attach processes to it:: - - # mkdir -p /sys/fs/cgroup/pids/parent/child - # echo 2 > /sys/fs/cgroup/pids/parent/pids.max - # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # - -It should be noted that attempts to overcome the set limit (2 in this case) will -fail:: - - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # ( /bin/echo "Here's some processes for you." | cat ) - sh: fork: Resource temporary unavailable - # - -Even if we migrate to a child cgroup (which doesn't have a set limit), we will -not be able to overcome the most stringent limit in the hierarchy (in this case, -parent's):: - - # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # cat /sys/fs/cgroup/pids/parent/child/pids.current - 2 - # cat /sys/fs/cgroup/pids/parent/child/pids.max - max - # ( /bin/echo "Here's some processes for you." | cat ) - sh: fork: Resource temporary unavailable - # - -We can set a limit that is smaller than pids.current, which will stop any new -processes from being forked at all (note that the shell itself counts towards -pids.current):: - - # echo 1 > /sys/fs/cgroup/pids/parent/pids.max - # /bin/echo "We can't even spawn a single process now." - sh: fork: Resource temporary unavailable - # echo 0 > /sys/fs/cgroup/pids/parent/pids.max - # /bin/echo "We can't even spawn a single process now." - sh: fork: Resource temporary unavailable - # diff --git a/Documentation/cgroup-v1/rdma.rst b/Documentation/cgroup-v1/rdma.rst deleted file mode 100644 index 2fcb0a9bf790..000000000000 --- a/Documentation/cgroup-v1/rdma.rst +++ /dev/null @@ -1,117 +0,0 @@ -=============== -RDMA Controller -=============== - -.. Contents - - 1. Overview - 1-1. What is RDMA controller? - 1-2. Why RDMA controller needed? - 1-3. How is RDMA controller implemented? - 2. Usage Examples - -1. Overview -=========== - -1-1. What is RDMA controller? ------------------------------ - -RDMA controller allows user to limit RDMA/IB specific resources that a given -set of processes can use. These processes are grouped using RDMA controller. - -RDMA controller defines two resources which can be limited for processes of a -cgroup. - -1-2. Why RDMA controller needed? --------------------------------- - -Currently user space applications can easily take away all the rdma verb -specific resources such as AH, CQ, QP, MR etc. Due to which other applications -in other cgroup or kernel space ULPs may not even get chance to allocate any -rdma resources. This can lead to service unavailability. - -Therefore RDMA controller is needed through which resource consumption -of processes can be limited. Through this controller different rdma -resources can be accounted. - -1-3. How is RDMA controller implemented? ----------------------------------------- - -RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains -resource accounting per cgroup, per device using resource pool structure. -Each such resource pool is limited up to 64 resources in given resource pool -by rdma cgroup, which can be extended later if required. - -This resource pool object is linked to the cgroup css. Typically there -are 0 to 4 resource pool instances per cgroup, per device in most use cases. -But nothing limits to have it more. At present hundreds of RDMA devices per -single cgroup may not be handled optimally, however there is no -known use case or requirement for such configuration either. - -Since RDMA resources can be allocated from any process and can be freed by any -of the child processes which shares the address space, rdma resources are -always owned by the creator cgroup css. This allows process migration from one -to other cgroup without major complexity of transferring resource ownership; -because such ownership is not really present due to shared nature of -rdma resources. Linking resources around css also ensures that cgroups can be -deleted after processes migrated. This allow progress migration as well with -active resources, even though that is not a primary use case. - -Whenever RDMA resource charging occurs, owner rdma cgroup is returned to -the caller. Same rdma cgroup should be passed while uncharging the resource. -This also allows process migrated with active RDMA resource to charge -to new owner cgroup for new resource. It also allows to uncharge resource of -a process from previously charged cgroup which is migrated to new cgroup, -even though that is not a primary use case. - -Resource pool object is created in following situations. -(a) User sets the limit and no previous resource pool exist for the device -of interest for the cgroup. -(b) No resource limits were configured, but IB/RDMA stack tries to -charge the resource. So that it correctly uncharge them when applications are -running without limits and later on when limits are enforced during uncharging, -otherwise usage count will drop to negative. - -Resource pool is destroyed if all the resource limits are set to max and -it is the last resource getting deallocated. - -User should set all the limit to max value if it intents to remove/unconfigure -the resource pool for a particular device. - -IB stack honors limits enforced by the rdma controller. When application -query about maximum resource limits of IB device, it returns minimum of -what is configured by user for a given cgroup and what is supported by -IB device. - -Following resources can be accounted by rdma controller. - - ========== ============================= - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - ========== ============================= - -2. Usage Examples -================= - -(a) Configure resource limit:: - - echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max - echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max - -(b) Query resource limit:: - - cat /sys/fs/cgroup/rdma/2/rdma.max - #Output: - mlx4_0 hca_handle=2 hca_object=2000 - ocrdma1 hca_handle=3 hca_object=max - -(c) Query current usage:: - - cat /sys/fs/cgroup/rdma/2/rdma.current - #Output: - mlx4_0 hca_handle=1 hca_object=20 - ocrdma1 hca_handle=1 hca_object=23 - -(d) Delete resource limit:: - - echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index cad797a8a39e..5ecbc03e6b2f 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for use at file creation time. When a task allocates a file in the file system, the mount option memory policy will be applied with a NodeList, if any, modified by the calling task's cpuset constraints -[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed +[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed below. If the resulting NodeLists is the empty set, the effective memory policy for the file will revert to "default" policy. diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index 5623b9916411..4f18456dd3b1 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -12,7 +12,7 @@ References - Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. -- Documentation/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. +- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. - man taskset: Using the taskset command to bind tasks to sets of CPUs. diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst index 3391e86d810c..14a2f7bf63fe 100644 --- a/Documentation/scheduler/sched-deadline.rst +++ b/Documentation/scheduler/sched-deadline.rst @@ -669,7 +669,7 @@ Deadline Task Scheduling -deadline tasks cannot have an affinity mask smaller that the entire root_domain they are created on. However, affinities can be specified - through the cpuset facility (Documentation/cgroup-v1/cpusets.rst). + through the cpuset facility (Documentation/admin-guide/cgroup-v1/cpusets.rst). 5.1 SCHED_DEADLINE and cpusets HOWTO ------------------------------------ diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst index 53b30d1967cf..a96c72651877 100644 --- a/Documentation/scheduler/sched-design-CFS.rst +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -222,7 +222,7 @@ SCHED_BATCH) tasks. These options need CONFIG_CGROUPS to be defined, and let the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroup-v1/cgroups.rst for more information about this filesystem. + Documentation/admin-guide/cgroup-v1/cgroups.rst for more information about this filesystem. When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each group created using the pseudo filesystem. See example steps below to create diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst index d27d3f3712fd..655a096ec8fb 100644 --- a/Documentation/scheduler/sched-rt-group.rst +++ b/Documentation/scheduler/sched-rt-group.rst @@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "/cpu.rt_runtime_us" to control the CPU time reserved for each control group. For more information on working with control groups, you should read -Documentation/cgroup-v1/cgroups.rst as well. +Documentation/admin-guide/cgroup-v1/cgroups.rst as well. Group settings are checked against the following limits in order to keep the configuration schedulable: diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst index 130f3cfa1c19..99fdeca917ca 100644 --- a/Documentation/vm/numa.rst +++ b/Documentation/vm/numa.rst @@ -67,7 +67,7 @@ nodes. Each emulated node will manage a fraction of the underlying cells' physical memory. NUMA emluation is useful for testing NUMA kernel and application features on non-NUMA platforms, and as a sort of memory resource management mechanism when used together with cpusets. -[see Documentation/cgroup-v1/cpusets.rst] +[see Documentation/admin-guide/cgroup-v1/cpusets.rst] For each node with memory, Linux constructs an independent memory management subsystem, complete with its own free page lists, in-use page lists, usage @@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see System administrators can restrict the CPUs and nodes' memories that a non- privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.rst] +using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] On architectures that do not hide memoryless nodes, Linux will include only zones [nodes] with memory in the zonelists. This means that for a memoryless diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 35bba27d5fff..1d6cd7db4e43 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -41,7 +41,7 @@ locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to move pages when a task is moved to another cpuset (See -Documentation/cgroup-v1/cpusets.rst). +Documentation/admin-guide/cgroup-v1/cpusets.rst). Cpusets allows the automation of process locality. If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically. Also the pages diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst index 109052215bce..17d0861b0f1d 100644 --- a/Documentation/vm/unevictable-lru.rst +++ b/Documentation/vm/unevictable-lru.rst @@ -98,7 +98,7 @@ Memory Control Group Interaction -------------------------------- The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/cgroup-v1/memory.rst] by extending the +memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the lru_list enum. The memory controller data structure automatically gets a per-zone unevictable diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst index 30108684ae87..ff9bcfd2cc14 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst @@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks. This is a way of limiting the amount of system memory that are available to a certain class of tasks. For more information on the features of cpusets, see -Documentation/cgroup-v1/cpusets.rst. +Documentation/admin-guide/cgroup-v1/cpusets.rst. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst. @@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:: On node 3 totalpages: 131072 Now following the instructions for mounting the cpusets filesystem from -Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory +Documentation/admin-guide/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory address spaces) to individual cpusets:: [root@xroads /]# mkdir exampleset diff --git a/MAINTAINERS b/MAINTAINERS index 0c603ea73034..c1593a668f80 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4158,7 +4158,7 @@ L: cgroups@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained F: Documentation/admin-guide/cgroup-v2.rst -F: Documentation/cgroup-v1/ +F: Documentation/admin-guide/cgroup-v1/ F: include/linux/cgroup* F: kernel/cgroup/ @@ -4169,7 +4169,7 @@ W: http://www.bullopensource.org/cpuset/ W: http://oss.sgi.com/projects/cpusets/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained -F: Documentation/cgroup-v1/cpusets.rst +F: Documentation/admin-guide/cgroup-v1/cpusets.rst F: include/linux/cpuset.h F: kernel/cgroup/cpuset.c diff --git a/block/Kconfig b/block/Kconfig index b16b3e075d31..8b5f8e560eb4 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING one needs to mount and use blkio cgroup controller for creating cgroups and specifying per device IO rate policies. - See Documentation/cgroup-v1/blkio-controller.rst for more information. + See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. config BLK_DEV_THROTTLING_LOW bool "Block throttling .low limit interface support (EXPERIMENTAL)" diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c5311935239d..430e219e3aba 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -624,7 +624,7 @@ struct cftype { /* * Control Group subsystem type. - * See Documentation/cgroup-v1/cgroups.rst for details + * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6f68438aa4ed..82699845ef79 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -806,7 +806,7 @@ union bpf_attr { * based on a user-provided identifier for all traffic coming from * the tasks belonging to the related cgroup. See also the related * kernel documentation, available from the Linux sources in file - * *Documentation/cgroup-v1/net_cls.rst*. + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. * * The Linux kernel has two versions for cgroups: there are * cgroups v1 and cgroups v2. Both are available to users, who can diff --git a/init/Kconfig b/init/Kconfig index 9eb92ee52d40..381cdfee6e0e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -821,7 +821,7 @@ menuconfig CGROUPS controls or device isolation. See - Documentation/scheduler/sched-design-CFS.rst (CFS) - - Documentation/cgroup-v1/ (features for grouping, isolation + - Documentation/admin-guide/cgroup-v1/ (features for grouping, isolation and resource control) Say N if unsure. @@ -883,7 +883,7 @@ config BLK_CGROUP CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set CONFIG_BLK_DEV_THROTTLING=y. - See Documentation/cgroup-v1/blkio-controller.rst for more information. + See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. config CGROUP_WRITEBACK bool diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b3b02b9c4405..863e434a6020 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -729,7 +729,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this diff --git a/security/device_cgroup.c b/security/device_cgroup.c index c07196502577..725674f3276d 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent) * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. - * Refer to Documentation/cgroup-v1/devices.rst for more details. + * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f506c68b2612..17e2b1713702 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -806,7 +806,7 @@ union bpf_attr { * based on a user-provided identifier for all traffic coming from * the tasks belonging to the related cgroup. See also the related * kernel documentation, available from the Linux sources in file - * *Documentation/cgroup-v1/net_cls.rst*. + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. * * The Linux kernel has two versions for cgroups: there are * cgroups v1 and cgroups v2. Both are available to users, who can -- cgit v1.2.3 From 03638e62f55f27e7a96d6b1175e75b7a81e562b3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 10 Apr 2019 09:56:05 -0700 Subject: LSM: SafeSetID: rewrite userspace API to atomic updates The current API of the SafeSetID LSM uses one write() per rule, and applies each written rule instantly. This has several downsides: - While a policy is being loaded, once a single parent-child pair has been loaded, the parent is restricted to that specific child, even if subsequent rules would allow transitions to other child UIDs. This means that during policy loading, set*uid() can randomly fail. - To replace the policy without rebooting, it is necessary to first flush all old rules. This creates a time window in which no constraints are placed on the use of CAP_SETUID. - If we want to perform sanity checks on the final policy, this requires that the policy isn't constructed in a piecemeal fashion without telling the kernel when it's done. Other kernel APIs - including things like the userns code and netfilter - avoid this problem by performing updates atomically. Luckily, SafeSetID hasn't landed in a stable (upstream) release yet, so maybe it's not too late to completely change the API. The new API for SafeSetID is: If you want to change the policy, open "safesetid/whitelist_policy" and write the entire policy, newline-delimited, in there. Signed-off-by: Jann Horn Signed-off-by: Micah Morton --- security/safesetid/lsm.c | 84 +++------ security/safesetid/lsm.h | 24 +-- security/safesetid/securityfs.c | 194 ++++++++++++--------- tools/testing/selftests/safesetid/safesetid-test.c | 16 +- 4 files changed, 149 insertions(+), 169 deletions(-) (limited to 'tools') diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c index 9db1c7a51d3d..22964e2a6187 100644 --- a/security/safesetid/lsm.c +++ b/security/safesetid/lsm.c @@ -24,28 +24,38 @@ /* Flag indicating whether initialization completed */ int safesetid_initialized; -#define NUM_BITS 8 /* 256 buckets in hash table */ +struct setuid_ruleset __rcu *safesetid_setuid_rules; -static DEFINE_HASHTABLE(safesetid_whitelist_hashtable, NUM_BITS); - -static DEFINE_SPINLOCK(safesetid_whitelist_hashtable_spinlock); - -static enum sid_policy_type setuid_policy_lookup(kuid_t src, kuid_t dst) +/* Compute a decision for a transition from @src to @dst under @policy. */ +enum sid_policy_type _setuid_policy_lookup(struct setuid_ruleset *policy, + kuid_t src, kuid_t dst) { - struct entry *entry; + struct setuid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; - rcu_read_lock(); - hash_for_each_possible_rcu(safesetid_whitelist_hashtable, - entry, next, __kuid_val(src)) { - if (!uid_eq(entry->src_uid, src)) + hash_for_each_possible(policy->rules, rule, next, __kuid_val(src)) { + if (!uid_eq(rule->src_uid, src)) continue; - if (uid_eq(entry->dst_uid, dst)) { - rcu_read_unlock(); + if (uid_eq(rule->dst_uid, dst)) return SIDPOL_ALLOWED; - } result = SIDPOL_CONSTRAINED; } + return result; +} + +/* + * Compute a decision for a transition from @src to @dst under the active + * policy. + */ +static enum sid_policy_type setuid_policy_lookup(kuid_t src, kuid_t dst) +{ + enum sid_policy_type result = SIDPOL_DEFAULT; + struct setuid_ruleset *pol; + + rcu_read_lock(); + pol = rcu_dereference(safesetid_setuid_rules); + if (pol) + result = _setuid_policy_lookup(pol, src, dst); rcu_read_unlock(); return result; } @@ -139,52 +149,6 @@ static int safesetid_task_fix_setuid(struct cred *new, return -EACCES; } -int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child) -{ - struct entry *new; - - /* Return if entry already exists */ - if (setuid_policy_lookup(parent, child) == SIDPOL_ALLOWED) - return 0; - - new = kzalloc(sizeof(struct entry), GFP_KERNEL); - if (!new) - return -ENOMEM; - new->src_uid = parent; - new->dst_uid = child; - spin_lock(&safesetid_whitelist_hashtable_spinlock); - hash_add_rcu(safesetid_whitelist_hashtable, - &new->next, - __kuid_val(parent)); - spin_unlock(&safesetid_whitelist_hashtable_spinlock); - return 0; -} - -void flush_safesetid_whitelist_entries(void) -{ - struct entry *entry; - struct hlist_node *hlist_node; - unsigned int bkt_loop_cursor; - HLIST_HEAD(free_list); - - /* - * Could probably use hash_for_each_rcu here instead, but this should - * be fine as well. - */ - spin_lock(&safesetid_whitelist_hashtable_spinlock); - hash_for_each_safe(safesetid_whitelist_hashtable, bkt_loop_cursor, - hlist_node, entry, next) { - hash_del_rcu(&entry->next); - hlist_add_head(&entry->dlist, &free_list); - } - spin_unlock(&safesetid_whitelist_hashtable_spinlock); - synchronize_rcu(); - hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist) { - hlist_del(&entry->dlist); - kfree(entry); - } -} - static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(capable, safesetid_security_capable) diff --git a/security/safesetid/lsm.h b/security/safesetid/lsm.h index 6806f902794c..4a34f558d964 100644 --- a/security/safesetid/lsm.h +++ b/security/safesetid/lsm.h @@ -21,12 +21,6 @@ /* Flag indicating whether initialization completed */ extern int safesetid_initialized; -/* Function type. */ -enum safesetid_whitelist_file_write_type { - SAFESETID_WHITELIST_ADD, /* Add whitelist policy. */ - SAFESETID_WHITELIST_FLUSH, /* Flush whitelist policies. */ -}; - enum sid_policy_type { SIDPOL_DEFAULT, /* source ID is unaffected by policy */ SIDPOL_CONSTRAINED, /* source ID is affected by policy */ @@ -35,18 +29,24 @@ enum sid_policy_type { /* * Hash table entry to store safesetid policy signifying that 'src_uid' - * can setid to 'dst_uid'. + * can setuid to 'dst_uid'. */ -struct entry { +struct setuid_rule { struct hlist_node next; - struct hlist_node dlist; /* for deletion cleanup */ kuid_t src_uid; kuid_t dst_uid; }; -/* Add entry to safesetid whitelist to allow 'parent' to setid to 'child'. */ -int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child); +#define SETID_HASH_BITS 8 /* 256 buckets in hash table */ + +struct setuid_ruleset { + DECLARE_HASHTABLE(rules, SETID_HASH_BITS); + struct rcu_head rcu; +}; + +enum sid_policy_type _setuid_policy_lookup(struct setuid_ruleset *policy, + kuid_t src, kuid_t dst); -void flush_safesetid_whitelist_entries(void); +extern struct setuid_ruleset __rcu *safesetid_setuid_rules; #endif /* _SAFESETID_H */ diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c index 77d301f0ff7a..250d59e046c1 100644 --- a/security/safesetid/securityfs.c +++ b/security/safesetid/securityfs.c @@ -11,25 +11,15 @@ * published by the Free Software Foundation. * */ + +#define pr_fmt(fmt) "SafeSetID: " fmt + #include #include #include "lsm.h" -static struct dentry *safesetid_policy_dir; - -struct safesetid_file_entry { - const char *name; - enum safesetid_whitelist_file_write_type type; - struct dentry *dentry; -}; - -static struct safesetid_file_entry safesetid_files[] = { - {.name = "add_whitelist_policy", - .type = SAFESETID_WHITELIST_ADD}, - {.name = "flush_whitelist_policies", - .type = SAFESETID_WHITELIST_FLUSH}, -}; +static DEFINE_SPINLOCK(policy_update_lock); /* * In the case the input buffer contains one or more invalid UIDs, the kuid_t @@ -37,8 +27,8 @@ static struct safesetid_file_entry safesetid_files[] = { * function will return an error. * Contents of @buf may be modified. */ -static int parse_policy_line( - struct file *file, char *buf, kuid_t *parent, kuid_t *child) +static int parse_policy_line(struct file *file, char *buf, + struct setuid_rule *rule) { char *child_str; int ret; @@ -59,26 +49,103 @@ static int parse_policy_line( if (ret) return ret; - *parent = make_kuid(file->f_cred->user_ns, parsed_parent); - *child = make_kuid(file->f_cred->user_ns, parsed_child); - if (!uid_valid(*parent) || !uid_valid(*child)) + rule->src_uid = make_kuid(file->f_cred->user_ns, parsed_parent); + rule->dst_uid = make_kuid(file->f_cred->user_ns, parsed_child); + if (!uid_valid(rule->src_uid) || !uid_valid(rule->dst_uid)) return -EINVAL; return 0; } -static int parse_safesetid_whitelist_policy( - struct file *file, const char __user *buf, size_t len, - kuid_t *parent, kuid_t *child) +static void __release_ruleset(struct rcu_head *rcu) { - char *kern_buf = memdup_user_nul(buf, len); - int ret; + struct setuid_ruleset *pol = + container_of(rcu, struct setuid_ruleset, rcu); + int bucket; + struct setuid_rule *rule; + struct hlist_node *tmp; + + hash_for_each_safe(pol->rules, bucket, tmp, rule, next) + kfree(rule); + kfree(pol); +} - if (IS_ERR(kern_buf)) - return PTR_ERR(kern_buf); - ret = parse_policy_line(file, kern_buf, parent, child); - kfree(kern_buf); - return ret; +static void release_ruleset(struct setuid_ruleset *pol) +{ + call_rcu(&pol->rcu, __release_ruleset); +} + +static ssize_t handle_policy_update(struct file *file, + const char __user *ubuf, size_t len) +{ + struct setuid_ruleset *pol; + char *buf, *p, *end; + int err; + + pol = kmalloc(sizeof(struct setuid_ruleset), GFP_KERNEL); + if (!pol) + return -ENOMEM; + hash_init(pol->rules); + + p = buf = memdup_user_nul(ubuf, len); + if (IS_ERR(buf)) { + err = PTR_ERR(buf); + goto out_free_pol; + } + + /* policy lines, including the last one, end with \n */ + while (*p != '\0') { + struct setuid_rule *rule; + + end = strchr(p, '\n'); + if (end == NULL) { + err = -EINVAL; + goto out_free_buf; + } + *end = '\0'; + + rule = kmalloc(sizeof(struct setuid_rule), GFP_KERNEL); + if (!rule) { + err = -ENOMEM; + goto out_free_buf; + } + + err = parse_policy_line(file, p, rule); + if (err) + goto out_free_rule; + + if (_setuid_policy_lookup(pol, rule->src_uid, rule->dst_uid) == + SIDPOL_ALLOWED) { + pr_warn("bad policy: duplicate entry\n"); + err = -EEXIST; + goto out_free_rule; + } + + hash_add(pol->rules, &rule->next, __kuid_val(rule->src_uid)); + p = end + 1; + continue; + +out_free_rule: + kfree(rule); + goto out_free_buf; + } + + /* + * Everything looks good, apply the policy and release the old one. + * What we really want here is an xchg() wrapper for RCU, but since that + * doesn't currently exist, just use a spinlock for now. + */ + spin_lock(&policy_update_lock); + rcu_swap_protected(safesetid_setuid_rules, pol, + lockdep_is_held(&policy_update_lock)); + spin_unlock(&policy_update_lock); + err = len; + +out_free_buf: + kfree(buf); +out_free_pol: + release_ruleset(pol); + return err; } static ssize_t safesetid_file_write(struct file *file, @@ -86,90 +153,45 @@ static ssize_t safesetid_file_write(struct file *file, size_t len, loff_t *ppos) { - struct safesetid_file_entry *file_entry = - file->f_inode->i_private; - kuid_t parent; - kuid_t child; - int ret; - if (!file_ns_capable(file, &init_user_ns, CAP_MAC_ADMIN)) return -EPERM; if (*ppos != 0) return -EINVAL; - switch (file_entry->type) { - case SAFESETID_WHITELIST_FLUSH: - flush_safesetid_whitelist_entries(); - break; - case SAFESETID_WHITELIST_ADD: - ret = parse_safesetid_whitelist_policy(file, buf, len, - &parent, &child); - if (ret) - return ret; - - ret = add_safesetid_whitelist_entry(parent, child); - if (ret) - return ret; - break; - default: - pr_warn("Unknown securityfs file %d\n", file_entry->type); - break; - } - - /* Return len on success so caller won't keep trying to write */ - return len; + return handle_policy_update(file, buf, len); } static const struct file_operations safesetid_file_fops = { .write = safesetid_file_write, }; -static void safesetid_shutdown_securityfs(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) { - struct safesetid_file_entry *entry = - &safesetid_files[i]; - securityfs_remove(entry->dentry); - entry->dentry = NULL; - } - - securityfs_remove(safesetid_policy_dir); - safesetid_policy_dir = NULL; -} - static int __init safesetid_init_securityfs(void) { - int i; int ret; + struct dentry *policy_dir; + struct dentry *policy_file; if (!safesetid_initialized) return 0; - safesetid_policy_dir = securityfs_create_dir("safesetid", NULL); - if (IS_ERR(safesetid_policy_dir)) { - ret = PTR_ERR(safesetid_policy_dir); + policy_dir = securityfs_create_dir("safesetid", NULL); + if (IS_ERR(policy_dir)) { + ret = PTR_ERR(policy_dir); goto error; } - for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) { - struct safesetid_file_entry *entry = - &safesetid_files[i]; - entry->dentry = securityfs_create_file( - entry->name, 0200, safesetid_policy_dir, - entry, &safesetid_file_fops); - if (IS_ERR(entry->dentry)) { - ret = PTR_ERR(entry->dentry); - goto error; - } + policy_file = securityfs_create_file("whitelist_policy", 0200, + policy_dir, NULL, &safesetid_file_fops); + if (IS_ERR(policy_file)) { + ret = PTR_ERR(policy_file); + goto error; } return 0; error: - safesetid_shutdown_securityfs(); + securityfs_remove(policy_dir); return ret; } fs_initcall(safesetid_init_securityfs); diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index 892c8e8b1b8b..4f03813d1911 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -142,23 +142,17 @@ static void ensure_securityfs_mounted(void) static void write_policies(void) { + static char *policy_str = + "1:2\n" + "1:3\n"; ssize_t written; int fd; fd = open(add_whitelist_policy_file, O_WRONLY); if (fd < 0) die("cant open add_whitelist_policy file\n"); - written = write(fd, "1:2", strlen("1:2")); - if (written != strlen("1:2")) { - if (written >= 0) { - die("short write to %s\n", add_whitelist_policy_file); - } else { - die("write to %s failed: %s\n", - add_whitelist_policy_file, strerror(errno)); - } - } - written = write(fd, "1:3", strlen("1:3")); - if (written != strlen("1:3")) { + written = write(fd, policy_str, strlen(policy_str)); + if (written != strlen(policy_str)) { if (written >= 0) { die("short write to %s\n", add_whitelist_policy_file); } else { -- cgit v1.2.3 From 4f72123da579655855301b591535a1415224f123 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 11 Apr 2019 13:12:43 -0700 Subject: LSM: SafeSetID: verify transitive constrainedness Someone might write a ruleset like the following, expecting that it securely constrains UID 1 to UIDs 1, 2 and 3: 1:2 1:3 However, because no constraints are applied to UIDs 2 and 3, an attacker with UID 1 can simply first switch to UID 2, then switch to any UID from there. The secure way to write this ruleset would be: 1:2 1:3 2:2 3:3 , which uses "transition to self" as a way to inhibit the default-allow policy without allowing anything specific. This is somewhat unintuitive. To make sure that policy authors don't accidentally write insecure policies because of this, let the kernel verify that a new ruleset does not contain any entries that are constrained, but transitively unconstrained. Signed-off-by: Jann Horn Signed-off-by: Micah Morton --- security/safesetid/securityfs.c | 38 +++++++++++++++++++++- tools/testing/selftests/safesetid/safesetid-test.c | 4 ++- 2 files changed, 40 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c index 997b403c6255..d568e17dd773 100644 --- a/security/safesetid/securityfs.c +++ b/security/safesetid/securityfs.c @@ -76,6 +76,37 @@ static void release_ruleset(struct setuid_ruleset *pol) call_rcu(&pol->rcu, __release_ruleset); } +static void insert_rule(struct setuid_ruleset *pol, struct setuid_rule *rule) +{ + hash_add(pol->rules, &rule->next, __kuid_val(rule->src_uid)); +} + +static int verify_ruleset(struct setuid_ruleset *pol) +{ + int bucket; + struct setuid_rule *rule, *nrule; + int res = 0; + + hash_for_each(pol->rules, bucket, rule, next) { + if (_setuid_policy_lookup(pol, rule->dst_uid, INVALID_UID) == + SIDPOL_DEFAULT) { + pr_warn("insecure policy detected: uid %d is constrained but transitively unconstrained through uid %d\n", + __kuid_val(rule->src_uid), + __kuid_val(rule->dst_uid)); + res = -EINVAL; + + /* fix it up */ + nrule = kmalloc(sizeof(struct setuid_rule), GFP_KERNEL); + if (!nrule) + return -ENOMEM; + nrule->src_uid = rule->dst_uid; + nrule->dst_uid = rule->dst_uid; + insert_rule(pol, nrule); + } + } + return res; +} + static ssize_t handle_policy_update(struct file *file, const char __user *ubuf, size_t len) { @@ -128,7 +159,7 @@ static ssize_t handle_policy_update(struct file *file, goto out_free_rule; } - hash_add(pol->rules, &rule->next, __kuid_val(rule->src_uid)); + insert_rule(pol, rule); p = end + 1; continue; @@ -137,6 +168,11 @@ out_free_rule: goto out_free_buf; } + err = verify_ruleset(pol); + /* bogus policy falls through after fixing it up */ + if (err && err != -EINVAL) + goto out_free_buf; + /* * Everything looks good, apply the policy and release the old one. * What we really want here is an xchg() wrapper for RCU, but since that diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index 4f03813d1911..8f40c6ecdad1 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -144,7 +144,9 @@ static void write_policies(void) { static char *policy_str = "1:2\n" - "1:3\n"; + "1:3\n" + "2:2\n" + "3:3\n"; ssize_t written; int fd; -- cgit v1.2.3 From c7ca0b614513afba57824cae68447f9c32b1ee61 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 15 Jul 2019 07:21:44 -0700 Subject: Revert "x86/ptrace: Prevent ptrace from clearing the FS/GS selector" and fix the test This reverts commit 48f5e52e916b55fb73754833efbacc7f8081a159. The ptrace ABI change was a prerequisite to the proposed design for FSGSBASE. Since FSGSBASE support has been reverted, and since I'm not convinced that the ABI was ever adequately tested, revert the ABI change as well. This also modifies the test case so that it tests the preexisting behavior. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/fca39c478ea7fb15bc76fe8a36bd180810a067f6.1563200250.git.luto@kernel.org --- arch/x86/kernel/ptrace.c | 14 ++++++++++++-- tools/testing/selftests/x86/fsgsbase.c | 22 ++++------------------ 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 71691a8310e7..0fdbe89d0754 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -369,12 +369,22 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - x86_fsbase_write_task(child, value); + /* + * When changing the FS base, use do_arch_prctl_64() + * to set the index to zero and to set the base + * as requested. + */ + if (child->thread.fsbase != value) + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. + */ if (value >= TASK_SIZE_MAX) return -EIO; - x86_gsbase_write_task(child, value); + if (child->thread.gsbase != value) + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 5ab4c60c100e..15a329da59fa 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -489,25 +489,11 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs != *shared_scratch) { - nerrs++; - printf("[FAIL]\tGS changed to %lx\n", gs); - - /* - * On older kernels, poking a nonzero value into the - * base would zero the selector. On newer kernels, - * this behavior has changed -- poking the base - * changes only the base and, if FSGSBASE is not - * available, this may have no effect. - */ - if (gs == 0) - printf("\tNote: this is expected behavior on older kernels.\n"); - } else if (have_fsgsbase && (base != 0xFF)) { - nerrs++; - printf("[FAIL]\tGSBASE changed to %lx\n", base); + if (gs == 0 && base == 0xFF) { + printf("[OK]\tGS was reset as expected\n"); } else { - printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : ""); - printf("\n"); + nerrs++; + printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base); } } -- cgit v1.2.3 From dd13f3ca642986b59a1863101e58cd694e81f888 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:25:56 -0700 Subject: selftests/bpf: add trickier size resolution tests Add more BTF tests, validating that size resolution logic is correct in few trickier cases. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_btf.c | 88 ++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 8351cb5f4a20..3d617e806054 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -3417,6 +3417,94 @@ static struct btf_raw_test raw_tests[] = { .value_type_id = 1, .max_entries = 4, }, +/* + * typedef int arr_t[16]; + * struct s { + * arr_t *a; + * }; + */ +{ + .descr = "struct->ptr->typedef->array->int size resolution", + .raw_types = { + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_PTR_ENC(3), /* [2] */ + BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_ARRAY_ENC(5, 5, 16), /* [4] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [5] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0a\0arr_t"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "ptr_mod_chain_size_resolve_map", + .key_size = sizeof(int), + .value_size = sizeof(int) * 16, + .key_type_id = 5 /* int */, + .value_type_id = 3 /* arr_t */, + .max_entries = 4, +}, +/* + * typedef int arr_t[16][8][4]; + * struct s { + * arr_t *a; + * }; + */ +{ + .descr = "struct->ptr->typedef->multi-array->int size resolution", + .raw_types = { + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_PTR_ENC(3), /* [2] */ + BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_ARRAY_ENC(5, 7, 16), /* [4] */ + BTF_TYPE_ARRAY_ENC(6, 7, 8), /* [5] */ + BTF_TYPE_ARRAY_ENC(7, 7, 4), /* [6] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [7] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0a\0arr_t"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "multi_arr_size_resolve_map", + .key_size = sizeof(int), + .value_size = sizeof(int) * 16 * 8 * 4, + .key_type_id = 7 /* int */, + .value_type_id = 3 /* arr_t */, + .max_entries = 4, +}, +/* + * typedef int int_t; + * typedef int_t arr3_t[4]; + * typedef arr3_t arr2_t[8]; + * typedef arr2_t arr1_t[16]; + * struct s { + * arr1_t *a; + * }; + */ +{ + .descr = "typedef/multi-arr mix size resolution", + .raw_types = { + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_PTR_ENC(3), /* [2] */ + BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_ARRAY_ENC(5, 10, 16), /* [4] */ + BTF_TYPEDEF_ENC(NAME_TBD, 6), /* [5] */ + BTF_TYPE_ARRAY_ENC(7, 10, 8), /* [6] */ + BTF_TYPEDEF_ENC(NAME_TBD, 8), /* [7] */ + BTF_TYPE_ARRAY_ENC(9, 10, 4), /* [8] */ + BTF_TYPEDEF_ENC(NAME_TBD, 10), /* [9] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [10] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0a\0arr1_t\0arr2_t\0arr3_t\0int_t"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "typedef_arra_mix_size_resolve_map", + .key_size = sizeof(int), + .value_size = sizeof(int) * 16 * 8 * 4, + .key_type_id = 10 /* int */, + .value_type_id = 3 /* arr_t */, + .max_entries = 4, +}, }; /* struct btf_raw_test raw_tests[] */ -- cgit v1.2.3 From 8981e56fa17282598571958ae6a29cbc3209a6cb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:25:57 -0700 Subject: selftests/bpf: use typedef'ed arrays as map values Convert few tests that couldn't use typedef'ed arrays due to kernel bug. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c | 3 ++- tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c | 3 +-- tools/testing/selftests/bpf/progs/test_stacktrace_map.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c index d06b47a09097..33254b771384 100644 --- a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c +++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c @@ -47,11 +47,12 @@ struct { * issue and avoid complicated C programming massaging. * This is an acceptable workaround since there is one entry here. */ +typedef __u64 raw_stack_trace_t[2 * MAX_STACK_RAWTP]; struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(max_entries, 1); __type(key, __u32); - __u64 (*value)[2 * MAX_STACK_RAWTP]; + __type(value, raw_stack_trace_t); } rawdata_map SEC(".maps"); SEC("tracepoint/raw_syscalls/sys_enter") diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c index bbfc8337b6f0..f5638e26865d 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c @@ -36,8 +36,7 @@ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 128); __type(key, __u32); - /* there seems to be a bug in kernel not handling typedef properly */ - struct bpf_stack_build_id (*value)[PERF_MAX_STACK_DEPTH]; + __type(value, stack_trace_t); } stack_amap SEC(".maps"); /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */ diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c index 803c15dc109d..fa0be3e10a10 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c @@ -35,7 +35,7 @@ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 16384); __type(key, __u32); - __u64 (*value)[PERF_MAX_STACK_DEPTH]; + __type(value, stack_trace_t); } stack_amap SEC(".maps"); /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ -- cgit v1.2.3 From 025c0c0917b78f21e3aaffecda75968d54172095 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 12 Jul 2019 15:41:42 +0200 Subject: selftests/bpf: fix attach_probe on s390 attach_probe test fails, because it cannot install a kprobe on a non-existent sys_nanosleep symbol. Use the correct symbol name for the nanosleep syscall on 64-bit s390. Don't bother adding one for 31-bit mode, since tests are compiled only in 64-bit mode. Fixes: 1e8611bbdfc9 ("selftests/bpf: add kprobe/uprobe selftests") Signed-off-by: Ilya Leoshkevich Acked-by: Vasily Gorbik Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/prog_tests/attach_probe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a4686395522c..47af4afc5013 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -23,6 +23,8 @@ ssize_t get_base_addr() { #ifdef __x86_64__ #define SYS_KPROBE_NAME "__x64_sys_nanosleep" +#elif defined(__s390x__) +#define SYS_KPROBE_NAME "__s390x_sys_nanosleep" #else #define SYS_KPROBE_NAME "sys_nanosleep" #endif -- cgit v1.2.3 From e46fc22e60a410d896448835adca95aa3332d25d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 12 Jul 2019 15:56:31 +0200 Subject: selftests/bpf: make directory prerequisites order-only When directories are used as prerequisites in Makefiles, they can cause a lot of unnecessary rebuilds, because a directory is considered changed whenever a file in this directory is added, removed or modified. If the only thing a target is interested in is the existence of the directory it depends on, which is the case for selftests/bpf, this directory should be specified as an order-only prerequisite: it would still be created in case it does not exist, but it would not trigger a rebuild of a target in case it's considered changed. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 277d8605e340..0e003fb6641b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -183,12 +183,12 @@ TEST_CUSTOM_PROGS += $(ALU32_BUILD_DIR)/test_progs_32 $(ALU32_BUILD_DIR): mkdir -p $@ -$(ALU32_BUILD_DIR)/urandom_read: $(OUTPUT)/urandom_read +$(ALU32_BUILD_DIR)/urandom_read: $(OUTPUT)/urandom_read | $(ALU32_BUILD_DIR) cp $< $@ $(ALU32_BUILD_DIR)/test_progs_32: test_progs.c $(OUTPUT)/libbpf.a\ - $(ALU32_BUILD_DIR) \ - $(ALU32_BUILD_DIR)/urandom_read + $(ALU32_BUILD_DIR)/urandom_read \ + | $(ALU32_BUILD_DIR) $(CC) $(TEST_PROGS_CFLAGS) $(CFLAGS) \ -o $(ALU32_BUILD_DIR)/test_progs_32 \ test_progs.c test_stub.c trace_helpers.c prog_tests/*.c \ @@ -197,8 +197,8 @@ $(ALU32_BUILD_DIR)/test_progs_32: test_progs.c $(OUTPUT)/libbpf.a\ $(ALU32_BUILD_DIR)/test_progs_32: $(PROG_TESTS_H) $(ALU32_BUILD_DIR)/test_progs_32: prog_tests/*.c -$(ALU32_BUILD_DIR)/%.o: progs/%.c $(ALU32_BUILD_DIR) \ - $(ALU32_BUILD_DIR)/test_progs_32 +$(ALU32_BUILD_DIR)/%.o: progs/%.c $(ALU32_BUILD_DIR)/test_progs_32 \ + | $(ALU32_BUILD_DIR) ($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm -c $< -o - || \ echo "clang failed") | \ $(LLC) -march=bpf -mattr=+alu32 -mcpu=$(CPU) $(LLC_FLAGS) \ @@ -236,7 +236,7 @@ $(PROG_TESTS_DIR): mkdir -p $@ PROG_TESTS_FILES := $(wildcard prog_tests/*.c) -$(PROG_TESTS_H): $(PROG_TESTS_DIR) $(PROG_TESTS_FILES) +$(PROG_TESTS_H): $(PROG_TESTS_FILES) | $(PROG_TESTS_DIR) $(shell ( cd prog_tests/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef DECLARE'; \ @@ -257,7 +257,7 @@ MAP_TESTS_H := $(MAP_TESTS_DIR)/tests.h test_maps.c: $(MAP_TESTS_H) $(OUTPUT)/test_maps: CFLAGS += $(TEST_MAPS_CFLAGS) MAP_TESTS_FILES := $(wildcard map_tests/*.c) -$(MAP_TESTS_H): $(MAP_TESTS_DIR) $(MAP_TESTS_FILES) +$(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) $(shell ( cd map_tests/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef DECLARE'; \ @@ -279,7 +279,7 @@ $(VERIFIER_TESTS_DIR): mkdir -p $@ VERIFIER_TEST_FILES := $(wildcard verifier/*.c) -$(OUTPUT)/verifier/tests.h: $(VERIFIER_TESTS_DIR) $(VERIFIER_TEST_FILES) +$(OUTPUT)/verifier/tests.h: $(VERIFIER_TEST_FILES) | $(VERIFIER_TESTS_DIR) $(shell ( cd verifier/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef FILL_ARRAY'; \ -- cgit v1.2.3 From f83a46d4711e789642033eb00fed8e017e34fe7d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 12 Jul 2019 15:59:50 +0200 Subject: selftests/bpf: put test_stub.o into $(OUTPUT) Add a rule to put test_stub.o in $(OUTPUT) and change the references to it accordingly. This prevents test_stub.o from being created in the source directory. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0e003fb6641b..1296253b3422 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -83,13 +83,16 @@ all: $(TEST_CUSTOM_PROGS) $(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c $(CC) -o $@ $< -Wl,--build-id +$(OUTPUT)/test_stub.o: test_stub.c + $(CC) $(TEST_PROGS_CFLAGS) $(CFLAGS) -c -o $@ $< + $(OUTPUT)/test_maps: map_tests/*.c BPFOBJ := $(OUTPUT)/libbpf.a -$(TEST_GEN_PROGS): test_stub.o $(BPFOBJ) +$(TEST_GEN_PROGS): $(OUTPUT)/test_stub.o $(BPFOBJ) -$(TEST_GEN_PROGS_EXTENDED): test_stub.o $(OUTPUT)/libbpf.a +$(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(OUTPUT)/libbpf.a $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c -- cgit v1.2.3 From 8b45063c8584b3e6caff0b109dd0f47b35487aba Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:54 -0700 Subject: selftests/bpf: rename verifier/wide_store.c to verifier/wide_access.c Move the file and rename internal BPF_SOCK_ADDR define to BPF_SOCK_ADDR_STORE. This selftest will be extended in the next commit with the wide loads. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/verifier/wide_access.c | 36 ++++++++++++++++++++++ tools/testing/selftests/bpf/verifier/wide_store.c | 36 ---------------------- 2 files changed, 36 insertions(+), 36 deletions(-) create mode 100644 tools/testing/selftests/bpf/verifier/wide_access.c delete mode 100644 tools/testing/selftests/bpf/verifier/wide_store.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/wide_access.c b/tools/testing/selftests/bpf/verifier/wide_access.c new file mode 100644 index 000000000000..3ac97328432f --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/wide_access.c @@ -0,0 +1,36 @@ +#define BPF_SOCK_ADDR_STORE(field, off, res, err) \ +{ \ + "wide store to bpf_sock_addr." #field "[" #off "]", \ + .insns = { \ + BPF_MOV64_IMM(BPF_REG_0, 1), \ + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, \ + offsetof(struct bpf_sock_addr, field[off])), \ + BPF_EXIT_INSN(), \ + }, \ + .result = res, \ + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ + .errstr = err, \ +} + +/* user_ip6[0] is u64 aligned */ +BPF_SOCK_ADDR_STORE(user_ip6, 0, ACCEPT, + NULL), +BPF_SOCK_ADDR_STORE(user_ip6, 1, REJECT, + "invalid bpf_context access off=12 size=8"), +BPF_SOCK_ADDR_STORE(user_ip6, 2, ACCEPT, + NULL), +BPF_SOCK_ADDR_STORE(user_ip6, 3, REJECT, + "invalid bpf_context access off=20 size=8"), + +/* msg_src_ip6[0] is _not_ u64 aligned */ +BPF_SOCK_ADDR_STORE(msg_src_ip6, 0, REJECT, + "invalid bpf_context access off=44 size=8"), +BPF_SOCK_ADDR_STORE(msg_src_ip6, 1, ACCEPT, + NULL), +BPF_SOCK_ADDR_STORE(msg_src_ip6, 2, REJECT, + "invalid bpf_context access off=52 size=8"), +BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, + "invalid bpf_context access off=56 size=8"), + +#undef BPF_SOCK_ADDR_STORE diff --git a/tools/testing/selftests/bpf/verifier/wide_store.c b/tools/testing/selftests/bpf/verifier/wide_store.c deleted file mode 100644 index 8fe99602ded4..000000000000 --- a/tools/testing/selftests/bpf/verifier/wide_store.c +++ /dev/null @@ -1,36 +0,0 @@ -#define BPF_SOCK_ADDR(field, off, res, err) \ -{ \ - "wide store to bpf_sock_addr." #field "[" #off "]", \ - .insns = { \ - BPF_MOV64_IMM(BPF_REG_0, 1), \ - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, \ - offsetof(struct bpf_sock_addr, field[off])), \ - BPF_EXIT_INSN(), \ - }, \ - .result = res, \ - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ - .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ - .errstr = err, \ -} - -/* user_ip6[0] is u64 aligned */ -BPF_SOCK_ADDR(user_ip6, 0, ACCEPT, - NULL), -BPF_SOCK_ADDR(user_ip6, 1, REJECT, - "invalid bpf_context access off=12 size=8"), -BPF_SOCK_ADDR(user_ip6, 2, ACCEPT, - NULL), -BPF_SOCK_ADDR(user_ip6, 3, REJECT, - "invalid bpf_context access off=20 size=8"), - -/* msg_src_ip6[0] is _not_ u64 aligned */ -BPF_SOCK_ADDR(msg_src_ip6, 0, REJECT, - "invalid bpf_context access off=44 size=8"), -BPF_SOCK_ADDR(msg_src_ip6, 1, ACCEPT, - NULL), -BPF_SOCK_ADDR(msg_src_ip6, 2, REJECT, - "invalid bpf_context access off=52 size=8"), -BPF_SOCK_ADDR(msg_src_ip6, 3, REJECT, - "invalid bpf_context access off=56 size=8"), - -#undef BPF_SOCK_ADDR -- cgit v1.2.3 From 7dd8d6119d481da1c4619eb7d8cfef33edfbee81 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:55 -0700 Subject: selftests/bpf: add selftests for wide loads Mirror existing wide store tests with wide loads. The only significant difference is expected error string. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/verifier/wide_access.c | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/wide_access.c b/tools/testing/selftests/bpf/verifier/wide_access.c index 3ac97328432f..ccade9312d21 100644 --- a/tools/testing/selftests/bpf/verifier/wide_access.c +++ b/tools/testing/selftests/bpf/verifier/wide_access.c @@ -34,3 +34,40 @@ BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, "invalid bpf_context access off=56 size=8"), #undef BPF_SOCK_ADDR_STORE + +#define BPF_SOCK_ADDR_LOAD(field, off, res, err) \ +{ \ + "wide load from bpf_sock_addr." #field "[" #off "]", \ + .insns = { \ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, \ + offsetof(struct bpf_sock_addr, field[off])), \ + BPF_MOV64_IMM(BPF_REG_0, 1), \ + BPF_EXIT_INSN(), \ + }, \ + .result = res, \ + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ + .errstr = err, \ +} + +/* user_ip6[0] is u64 aligned */ +BPF_SOCK_ADDR_LOAD(user_ip6, 0, ACCEPT, + NULL), +BPF_SOCK_ADDR_LOAD(user_ip6, 1, REJECT, + "invalid bpf_context access off=12 size=8"), +BPF_SOCK_ADDR_LOAD(user_ip6, 2, ACCEPT, + NULL), +BPF_SOCK_ADDR_LOAD(user_ip6, 3, REJECT, + "invalid bpf_context access off=20 size=8"), + +/* msg_src_ip6[0] is _not_ u64 aligned */ +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 0, REJECT, + "invalid bpf_context access off=44 size=8"), +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 1, ACCEPT, + NULL), +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 2, REJECT, + "invalid bpf_context access off=52 size=8"), +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 3, REJECT, + "invalid bpf_context access off=56 size=8"), + +#undef BPF_SOCK_ADDR_LOAD -- cgit v1.2.3 From 073a4834a81368c8af9cc9e99ff83245600a8f6b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:56 -0700 Subject: bpf: sync bpf.h to tools/ Update bpf_sock_addr comments to indicate support for 8-byte reads from user_ip6 and msg_src_ip6. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f506c68b2612..1f61374fcf81 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3245,7 +3245,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3257,7 +3257,7 @@ struct bpf_sock_addr { __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); -- cgit v1.2.3 From d5e1db990fcce571a4af2434d6fc2b312d94b45e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:44:41 -0700 Subject: selftests/bpf: remove logic duplication in test_verifier test_verifier tests can specify single- and multi-runs tests. Internally logic of handling them is duplicated. Get rid of it by making single run retval/data specification to be a first run spec. Signed-off-by: Andrii Nakryiko Cc: Krzesimir Nowak Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_verifier.c | 35 +++++++++++++---------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b0773291012a..84135d5f4b35 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -86,7 +86,7 @@ struct bpf_test { int fixup_sk_storage_map[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; - uint32_t retval, retval_unpriv, insn_processed; + uint32_t insn_processed; int prog_len; enum { UNDEF, @@ -95,16 +95,20 @@ struct bpf_test { } result, result_unpriv; enum bpf_prog_type prog_type; uint8_t flags; - __u8 data[TEST_DATA_LEN]; void (*fill_helper)(struct bpf_test *self); uint8_t runs; - struct { - uint32_t retval, retval_unpriv; - union { - __u8 data[TEST_DATA_LEN]; - __u64 data64[TEST_DATA_LEN / 8]; - }; - } retvals[MAX_TEST_RUNS]; +#define bpf_testdata_struct_t \ + struct { \ + uint32_t retval, retval_unpriv; \ + union { \ + __u8 data[TEST_DATA_LEN]; \ + __u64 data64[TEST_DATA_LEN / 8]; \ + }; \ + } + union { + bpf_testdata_struct_t; + bpf_testdata_struct_t retvals[MAX_TEST_RUNS]; + }; enum bpf_attach_type expected_attach_type; }; @@ -949,17 +953,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, uint32_t expected_val; int i; - if (!test->runs) { - expected_val = unpriv && test->retval_unpriv ? - test->retval_unpriv : test->retval; - - err = do_prog_test_run(fd_prog, unpriv, expected_val, - test->data, sizeof(test->data)); - if (err) - run_errs++; - else - run_successes++; - } + if (!test->runs) + test->runs = 1; for (i = 0; i < test->runs; i++) { if (unpriv && test->retvals[i].retval_unpriv) -- cgit v1.2.3 From 3461a0a02141b2612f2916e9250a430de41b0290 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 16 Jul 2019 12:53:53 +0200 Subject: selftests/bpf: fix "alu with different scalars 1" on s390 BPF_LDX_MEM is used to load the least significant byte of the retrieved test_val.index, however, on big-endian machines it ends up retrieving the most significant byte. Change the test to load the whole int in order to make it endianness-independent. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/value_ptr_arith.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index c3de1a2c9dc5..a53d99cebd9f 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -183,7 +183,7 @@ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0x100000), -- cgit v1.2.3 From 4e59afbbed9638f95adfac8d2f222aca3b2b1bc0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 16 Jul 2019 12:56:34 +0200 Subject: selftests/bpf: skip nmi test when perf hw events are disabled Some setups (e.g. virtual machines) might run with hardware perf events disabled. If this is the case, skip the test_send_signal_nmi test. Add a separate test involving a software perf event. This allows testing the perf event path regardless of hardware perf event support. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/send_signal.c | 33 +++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 67cea1686305..54218ee3c004 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -173,6 +173,18 @@ static int test_send_signal_tracepoint(void) return test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint"); } +static int test_send_signal_perf(void) +{ + struct perf_event_attr attr = { + .sample_period = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + }; + + return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, + "perf_sw_event"); +} + static int test_send_signal_nmi(void) { struct perf_event_attr attr = { @@ -181,8 +193,26 @@ static int test_send_signal_nmi(void) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, }; + int pmu_fd; + + /* Some setups (e.g. virtual machines) might run with hardware + * perf events disabled. If this is the case, skip this test. + */ + pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, + -1 /* cpu */, -1 /* group_fd */, 0 /* flags */); + if (pmu_fd == -1) { + if (errno == ENOENT) { + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", + __func__); + return 0; + } + /* Let the test fail with a more informative message */ + } else { + close(pmu_fd); + } - return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, "perf_event"); + return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, + "perf_hw_event"); } void test_send_signal(void) @@ -190,6 +220,7 @@ void test_send_signal(void) int ret = 0; ret |= test_send_signal_tracepoint(); + ret |= test_send_signal_perf(); ret |= test_send_signal_nmi(); if (!ret) printf("test_send_signal:OK\n"); -- cgit v1.2.3 From 763ff0e7d9c72e7094b31e7fb84a859be9325635 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Jul 2019 20:57:03 -0700 Subject: libbpf: fix another GCC8 warning for strncpy Similar issue was fixed in cdfc7f888c2a ("libbpf: fix GCC8 warning for strncpy") already. This one was missed. Fixing now. Cc: Magnus Karlsson Signed-off-by: Andrii Nakryiko Acked-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/xsk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index b33740221b7e..5007b5d4fd2c 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -517,7 +517,8 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, err = -errno; goto out_socket; } - strncpy(xsk->ifname, ifname, IFNAMSIZ); + strncpy(xsk->ifname, ifname, IFNAMSIZ - 1); + xsk->ifname[IFNAMSIZ - 1] = '\0'; err = xsk_set_xdp_socket_config(&xsk->config, usr_config); if (err) -- cgit v1.2.3 From cbd965bde74c3d0e53798404fd4b9829a68ccec8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 3 Jul 2019 14:40:42 -0400 Subject: ftrace/selftests: Return the skip code when tracing directory not configured in kernel If the kernel is not configured with ftrace enabled, the ftracetest selftests should return the error code of "4" as that is the kselftests "skip" code, and not "1" which means an error. To determine if ftrace is enabled, first the newer "tracefs" is searched for in /proc/mounts. If it is not found, then "debugfs" is searched for (as old kernels do not have tracefs). If that is not found, an attempt to mount the tracefs or debugfs is performed. This is done by seeing first if the /sys/kernel/tracing directory exists. If it does than tracefs is configured in the kernel and an attempt to mount it is performed. If /sys/kernel/tracing does not exist, then /sys/kernel/debug is tested to see if that directory exists. If it does, then an attempt to mount debugfs on that directory is performed. If it does not exist, then debugfs is not configured in the running kernel and the test exits with the skip code. If either mount fails, then a normal error is returned as they do exist in the kernel but something went wrong to mount them. This changes the test to always try the tracefs file system first as it has been in the kernel for some time now and it is better to test it if it is available instead of always testing debugfs. Link: http://lkml.kernel.org/r/20190702062358.7330-1-po-hsu.lin@canonical.com Reported-by: Po-Hsu Lin Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/ftracetest | 38 ++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 136387422b00..edf5ea35d2a7 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -23,9 +23,15 @@ echo " If is -, all logs output in console only" exit $1 } +# default error +err_ret=1 + +# kselftest skip code is 4 +err_skip=4 + errexit() { # message echo "Error: $1" 1>&2 - exit 1 + exit $err_ret } # Ensuring user privilege @@ -116,11 +122,31 @@ parse_opts() { # opts } # Parameters -DEBUGFS_DIR=`grep debugfs /proc/mounts | cut -f2 -d' ' | head -1` -if [ -z "$DEBUGFS_DIR" ]; then - TRACING_DIR=`grep tracefs /proc/mounts | cut -f2 -d' ' | head -1` -else - TRACING_DIR=$DEBUGFS_DIR/tracing +TRACING_DIR=`grep tracefs /proc/mounts | cut -f2 -d' ' | head -1` +if [ -z "$TRACING_DIR" ]; then + DEBUGFS_DIR=`grep debugfs /proc/mounts | cut -f2 -d' ' | head -1` + if [ -z "$DEBUGFS_DIR" ]; then + # If tracefs exists, then so does /sys/kernel/tracing + if [ -d "/sys/kernel/tracing" ]; then + mount -t tracefs nodev /sys/kernel/tracing || + errexit "Failed to mount /sys/kernel/tracing" + TRACING_DIR="/sys/kernel/tracing" + # If debugfs exists, then so does /sys/kernel/debug + elif [ -d "/sys/kernel/debug" ]; then + mount -t debugfs nodev /sys/kernel/debug || + errexit "Failed to mount /sys/kernel/debug" + TRACING_DIR="/sys/kernel/debug/tracing" + else + err_ret=$err_skip + errexit "debugfs and tracefs are not configured in this kernel" + fi + else + TRACING_DIR="$DEBUGFS_DIR/tracing" + fi +fi +if [ ! -d "$TRACING_DIR" ]; then + err_ret=$err_skip + errexit "ftrace is not configured in this kernel" fi TOP_DIR=`absdir $0` -- cgit v1.2.3 From 6e55f320f00e4db7144a4906dd8ac53701891e31 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 3 Jul 2019 15:46:08 -0400 Subject: ftrace/selftest: Test if set_event/ftrace_pid exists before writing While testing on a very old kernel (3.5), the tests failed because the write to set_event_pid in the setup code, did not exist. The tests themselves could pass, but the setup failed causing an error. Other files test for existance before writing to them. Do the same for set_event_pid and set_ftrace_pid. Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/test.d/functions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index 779ec11f61bd..1d96c5f7e402 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -91,8 +91,8 @@ initialize_ftrace() { # Reset ftrace to initial-state reset_events_filter reset_ftrace_filter disable_events - echo > set_event_pid # event tracer is always on - echo > set_ftrace_pid + [ -f set_event_pid ] && echo > set_event_pid + [ -f set_ftrace_pid ] && echo > set_ftrace_pid [ -f set_ftrace_filter ] && echo | tee set_ftrace_* [ -f set_graph_function ] && echo | tee set_graph_* [ -f stack_trace_filter ] && echo > stack_trace_filter -- cgit v1.2.3 From 36646b22ce2455c25c76c9023a0e5ca58d62899a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 16 Jul 2019 12:38:36 -0700 Subject: selftests/bpf: fix test_verifier/test_maps make dependencies e46fc22e60a4 ("selftests/bpf: make directory prerequisites order-only") exposed existing problem in Makefile for test_verifier and test_maps tests: their dependency on auto-generated header file with a list of all tests wasn't recorded explicitly. This patch fixes these issues. Fixes: 51a0e301a563 ("bpf: Add BPF_MAP_TYPE_SK_STORAGE test to test_maps") Fixes: 6b7b6995c43e ("selftests: bpf: tests.h should depend on .c files, not the output") Cc: Ilya Leoshkevich Cc: Stanislav Fomichev Cc: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1296253b3422..9bc68d8abc5f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -86,8 +86,6 @@ $(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c $(OUTPUT)/test_stub.o: test_stub.c $(CC) $(TEST_PROGS_CFLAGS) $(CFLAGS) -c -o $@ $< -$(OUTPUT)/test_maps: map_tests/*.c - BPFOBJ := $(OUTPUT)/libbpf.a $(TEST_GEN_PROGS): $(OUTPUT)/test_stub.o $(BPFOBJ) @@ -257,9 +255,10 @@ MAP_TESTS_DIR = $(OUTPUT)/map_tests $(MAP_TESTS_DIR): mkdir -p $@ MAP_TESTS_H := $(MAP_TESTS_DIR)/tests.h +MAP_TESTS_FILES := $(wildcard map_tests/*.c) test_maps.c: $(MAP_TESTS_H) $(OUTPUT)/test_maps: CFLAGS += $(TEST_MAPS_CFLAGS) -MAP_TESTS_FILES := $(wildcard map_tests/*.c) +$(OUTPUT)/test_maps: test_maps.c $(MAP_TESTS_H) $(MAP_TESTS_FILES) $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) $(shell ( cd map_tests/; \ echo '/* Generated header, do not edit */'; \ @@ -276,6 +275,7 @@ $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) VERIFIER_TESTS_H := $(OUTPUT)/verifier/tests.h test_verifier.c: $(VERIFIER_TESTS_H) $(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS) +$(OUTPUT)/test_verifier: test_verifier.c $(VERIFIER_TESTS_H) VERIFIER_TESTS_DIR = $(OUTPUT)/verifier $(VERIFIER_TESTS_DIR): -- cgit v1.2.3 From 9d1f62a6dcf0de3ab184c1f7b7d82117dcbab090 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 16 Jul 2019 12:38:37 -0700 Subject: selftests/bpf: structure test_{progs, maps, verifier} test runners uniformly It's easier to follow the logic if it's structured the same. There is just slight difference between test_progs/test_maps and test_verifier. test_verifier's verifier/*.c files are not really compilable C files (they are more of include headers), so they can't be specified as explicit dependencies of test_verifier. Cc: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9bc68d8abc5f..11c9c62c3362 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -176,6 +176,7 @@ endif endif TEST_PROGS_CFLAGS := -I. -I$(OUTPUT) +TEST_MAPS_CFLAGS := -I. -I$(OUTPUT) TEST_VERIFIER_CFLAGS := -I. -I$(OUTPUT) -Iverifier ifneq ($(SUBREG_CODEGEN),) @@ -227,16 +228,14 @@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif -PROG_TESTS_H := $(OUTPUT)/prog_tests/tests.h -test_progs.c: $(PROG_TESTS_H) -$(OUTPUT)/test_progs: CFLAGS += $(TEST_PROGS_CFLAGS) -$(OUTPUT)/test_progs: prog_tests/*.c - PROG_TESTS_DIR = $(OUTPUT)/prog_tests $(PROG_TESTS_DIR): mkdir -p $@ - +PROG_TESTS_H := $(PROG_TESTS_DIR)/tests.h PROG_TESTS_FILES := $(wildcard prog_tests/*.c) +test_progs.c: $(PROG_TESTS_H) +$(OUTPUT)/test_progs: CFLAGS += $(TEST_PROGS_CFLAGS) +$(OUTPUT)/test_progs: test_progs.c $(PROG_TESTS_H) $(PROG_TESTS_FILES) $(PROG_TESTS_H): $(PROG_TESTS_FILES) | $(PROG_TESTS_DIR) $(shell ( cd prog_tests/; \ echo '/* Generated header, do not edit */'; \ @@ -250,7 +249,6 @@ $(PROG_TESTS_H): $(PROG_TESTS_FILES) | $(PROG_TESTS_DIR) echo '#endif' \ ) > $(PROG_TESTS_H)) -TEST_MAPS_CFLAGS := -I. -I$(OUTPUT) MAP_TESTS_DIR = $(OUTPUT)/map_tests $(MAP_TESTS_DIR): mkdir -p $@ @@ -272,17 +270,15 @@ $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) echo '#endif' \ ) > $(MAP_TESTS_H)) -VERIFIER_TESTS_H := $(OUTPUT)/verifier/tests.h -test_verifier.c: $(VERIFIER_TESTS_H) -$(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS) -$(OUTPUT)/test_verifier: test_verifier.c $(VERIFIER_TESTS_H) - VERIFIER_TESTS_DIR = $(OUTPUT)/verifier $(VERIFIER_TESTS_DIR): mkdir -p $@ - +VERIFIER_TESTS_H := $(VERIFIER_TESTS_DIR)/tests.h VERIFIER_TEST_FILES := $(wildcard verifier/*.c) -$(OUTPUT)/verifier/tests.h: $(VERIFIER_TEST_FILES) | $(VERIFIER_TESTS_DIR) +test_verifier.c: $(VERIFIER_TESTS_H) +$(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS) +$(OUTPUT)/test_verifier: test_verifier.c $(VERIFIER_TESTS_H) +$(VERIFIER_TESTS_H): $(VERIFIER_TEST_FILES) | $(VERIFIER_TESTS_DIR) $(shell ( cd verifier/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef FILL_ARRAY'; \ -- cgit v1.2.3 From 1cb59a6074e23c6f6513642f752a6d8d38327354 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 16 Jul 2019 14:58:27 +0200 Subject: selftests/bpf: fix perf_buffer on s390 perf_buffer test fails for exactly the same reason test_attach_probe used to fail: different nanosleep syscall kprobe name. Reuse the test_attach_probe fix. Fixes: ee5cf82ce04a ("selftests/bpf: test perf buffer API") Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/attach_probe.c | 12 ++---------- tools/testing/selftests/bpf/prog_tests/perf_buffer.c | 8 +------- tools/testing/selftests/bpf/test_progs.h | 8 ++++++++ 3 files changed, 11 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 47af4afc5013..5ecc267d98b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -21,14 +21,6 @@ ssize_t get_base_addr() { return -EINVAL; } -#ifdef __x86_64__ -#define SYS_KPROBE_NAME "__x64_sys_nanosleep" -#elif defined(__s390x__) -#define SYS_KPROBE_NAME "__s390x_sys_nanosleep" -#else -#define SYS_KPROBE_NAME "sys_nanosleep" -#endif - void test_attach_probe(void) { const char *kprobe_name = "kprobe/sys_nanosleep"; @@ -86,7 +78,7 @@ void test_attach_probe(void) kprobe_link = bpf_program__attach_kprobe(kprobe_prog, false /* retprobe */, - SYS_KPROBE_NAME); + SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", "err %ld\n", PTR_ERR(kprobe_link))) { kprobe_link = NULL; @@ -94,7 +86,7 @@ void test_attach_probe(void) } kretprobe_link = bpf_program__attach_kprobe(kretprobe_prog, true /* retprobe */, - SYS_KPROBE_NAME); + SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kretprobe_link), "attach_kretprobe", "err %ld\n", PTR_ERR(kretprobe_link))) { kretprobe_link = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 3f1ef95865ff..3003fddc0613 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -5,12 +5,6 @@ #include #include -#ifdef __x86_64__ -#define SYS_KPROBE_NAME "__x64_sys_nanosleep" -#else -#define SYS_KPROBE_NAME "sys_nanosleep" -#endif - static void on_sample(void *ctx, int cpu, void *data, __u32 size) { int cpu_data = *(int *)data, duration = 0; @@ -56,7 +50,7 @@ void test_perf_buffer(void) /* attach kprobe */ link = bpf_program__attach_kprobe(prog, false /* retprobe */, - SYS_KPROBE_NAME); + SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link))) goto out_close; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f095e1d4c657..49e0f7d85643 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -92,3 +92,11 @@ int compare_map_keys(int map1_fd, int map2_fd); int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); int extract_build_id(char *build_id, size_t size); void *spin_lock_thread(void *arg); + +#ifdef __x86_64__ +#define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" +#elif defined(__s390x__) +#define SYS_NANOSLEEP_KPROBE_NAME "__s390x_sys_nanosleep" +#else +#define SYS_NANOSLEEP_KPROBE_NAME "sys_nanosleep" +#endif -- cgit v1.2.3 From bca1eac55a940025065645158c1a3429ac697df6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:26:36 -0700 Subject: tools/testing/selftests/proc/proc-pid-vm.c: hide "segfault at ffffffffff600000" dmesg spam Test tries to access vsyscall page and if it doesn't exist gets SIGSEGV which can spam into dmesg. However the segfault happens by design. Handle it and carry information via exit code to parent. Link: http://lkml.kernel.org/r/20190524181256.GA2260@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/proc-pid-vm.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index 853aa164a401..18a3bde8bc96 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -215,6 +215,11 @@ static const char str_vsyscall[] = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ +static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) +{ + _exit(1); +} + /* * vsyscall page can't be unmapped, probe it with memory load. */ @@ -231,11 +236,19 @@ static void vsyscall(void) if (pid == 0) { struct rlimit rlim = {0, 0}; (void)setrlimit(RLIMIT_CORE, &rlim); + + /* Hide "segfault at ffffffffff600000" messages. */ + struct sigaction act; + memset(&act, 0, sizeof(struct sigaction)); + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV; + (void)sigaction(SIGSEGV, &act, NULL); + *(volatile int *)0xffffffffff600000UL; exit(0); } - wait(&wstatus); - if (WIFEXITED(wstatus)) { + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { g_vsyscall = true; } } -- cgit v1.2.3 From 7dbbade1f285e881119049563ab2a036c96dd9f3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:26:48 -0700 Subject: proc: test /proc/sysvipc vs setns(CLONE_NEWIPC) I thought that /proc/sysvipc has the same bug as /proc/net commit 1fde6f21d90f8ba5da3cb9c54ca991ed72696c43 proc: fix /proc/net/* after setns(2) However, it doesn't! /proc/sysvipc files do get_ipc_ns(current->nsproxy->ipc_ns); in their open() hook and avoid the problem. Keep the test, maybe /proc/sysvipc will become broken someday :-\ Link: http://lkml.kernel.org/r/20190706180146.GA21015@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/setns-sysvipc.c | 133 +++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/proc/setns-sysvipc.c (limited to 'tools') diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 444ad39d3700..66fab4c58ed4 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -12,4 +12,5 @@ /read /self /setns-dcache +/setns-sysvipc /thread-self diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 9f09fcd09ea3..a8ed0f684829 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -17,6 +17,7 @@ TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read TEST_GEN_PROGS += self TEST_GEN_PROGS += setns-dcache +TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self include ../lib.mk diff --git a/tools/testing/selftests/proc/setns-sysvipc.c b/tools/testing/selftests/proc/setns-sysvipc.c new file mode 100644 index 000000000000..903890c5e587 --- /dev/null +++ b/tools/testing/selftests/proc/setns-sysvipc.c @@ -0,0 +1,133 @@ +/* + * Copyright © 2019 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Test that setns(CLONE_NEWIPC) points to new /proc/sysvipc content even + * if old one is in dcache. + */ +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static pid_t pid = -1; + +static void f(void) +{ + if (pid > 0) { + kill(pid, SIGTERM); + } +} + +int main(void) +{ + int fd[2]; + char _ = 0; + int nsfd; + + atexit(f); + + /* Check for priviledges and syscall availability straight away. */ + if (unshare(CLONE_NEWIPC) == -1) { + if (errno == ENOSYS || errno == EPERM) { + return 4; + } + return 1; + } + /* Distinguisher between two otherwise empty IPC namespaces. */ + if (shmget(IPC_PRIVATE, 1, IPC_CREAT) == -1) { + return 1; + } + + if (pipe(fd) == -1) { + return 1; + } + + pid = fork(); + if (pid == -1) { + return 1; + } + + if (pid == 0) { + if (unshare(CLONE_NEWIPC) == -1) { + return 1; + } + + if (write(fd[1], &_, 1) != 1) { + return 1; + } + + pause(); + + return 0; + } + + if (read(fd[0], &_, 1) != 1) { + return 1; + } + + { + char buf[64]; + snprintf(buf, sizeof(buf), "/proc/%u/ns/ipc", pid); + nsfd = open(buf, O_RDONLY); + if (nsfd == -1) { + return 1; + } + } + + /* Reliably pin dentry into dcache. */ + (void)open("/proc/sysvipc/shm", O_RDONLY); + + if (setns(nsfd, CLONE_NEWIPC) == -1) { + return 1; + } + + kill(pid, SIGTERM); + pid = 0; + + { + char buf[4096]; + ssize_t rv; + int fd; + + fd = open("/proc/sysvipc/shm", O_RDONLY); + if (fd == -1) { + return 1; + } + +#define S32 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n" +#define S64 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n" + rv = read(fd, buf, sizeof(buf)); + if (rv == strlen(S32)) { + assert(memcmp(buf, S32, strlen(S32)) == 0); + } else if (rv == strlen(S64)) { + assert(memcmp(buf, S64, strlen(S64)) == 0); + } else { + assert(0); + } + } + + return 0; +} -- cgit v1.2.3 From 201766a20e30f982ccfe36bebfad9602c3ff574a Mon Sep 17 00:00:00 2001 From: Elvira Khabirova Date: Tue, 16 Jul 2019 16:29:42 -0700 Subject: ptrace: add PTRACE_GET_SYSCALL_INFO request PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain details of the syscall the tracee is blocked in. There are two reasons for a special syscall-related ptrace request. Firstly, with the current ptrace API there are cases when ptracer cannot retrieve necessary information about syscalls. Some examples include: * The notorious int-0x80-from-64-bit-task issue. See [1] for details. In short, if a 64-bit task performs a syscall through int 0x80, its tracer has no reliable means to find out that the syscall was, in fact, a compat syscall, and misidentifies it. * Syscall-enter-stop and syscall-exit-stop look the same for the tracer. Common practice is to keep track of the sequence of ptrace-stops in order not to mix the two syscall-stops up. But it is not as simple as it looks; for example, strace had a (just recently fixed) long-standing bug where attaching strace to a tracee that is performing the execve system call led to the tracer identifying the following syscall-exit-stop as syscall-enter-stop, which messed up all the state tracking. * Since the introduction of commit 84d77d3f06e7 ("ptrace: Don't allow accessing an undumpable mm"), both PTRACE_PEEKDATA and process_vm_readv become unavailable when the process dumpable flag is cleared. On such architectures as ia64 this results in all syscall arguments being unavailable for the tracer. Secondly, ptracers also have to support a lot of arch-specific code for obtaining information about the tracee. For some architectures, this requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall argument and return value. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_GET_SYSCALL_INFO Retrieve information about the syscall that caused the stop. The information is placed into the buffer pointed by "data" argument, which should be a pointer to a buffer of type "struct ptrace_syscall_info". The "addr" argument contains the size of the buffer pointed to by "data" argument (i.e., sizeof(struct ptrace_syscall_info)). The return value contains the number of bytes available to be written by the kernel. If the size of data to be written by the kernel exceeds the size specified by "addr" argument, the output is truncated. [ldv@altlinux.org: selftests/seccomp/seccomp_bpf: update for PTRACE_GET_SYSCALL_INFO] Link: http://lkml.kernel.org/r/20190708182904.GA12332@altlinux.org Link: http://lkml.kernel.org/r/20190510152842.GF28558@altlinux.org Signed-off-by: Elvira Khabirova Co-developed-by: Dmitry V. Levin Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Cc: Eugene Syromyatnikov Cc: Benjamin Herrenschmidt Cc: Greentime Hu Cc: Helge Deller [parisc] Cc: James E.J. Bottomley Cc: James Hogan Cc: kbuild test robot Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Kuo Cc: Shuah Khan Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 9 ++- include/uapi/linux/ptrace.h | 35 +++++++++ kernel/ptrace.c | 101 +++++++++++++++++++++++++- tools/testing/selftests/seccomp/seccomp_bpf.c | 13 +++- 4 files changed, 150 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 8446573cc682..36fb3bbed6b2 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -54,13 +54,15 @@ struct linux_binprm; /* * ptrace report for syscall entry and exit looks identical. */ -static inline int ptrace_report_syscall(struct pt_regs *regs) +static inline int ptrace_report_syscall(struct pt_regs *regs, + unsigned long message) { int ptrace = current->ptrace; if (!(ptrace & PT_PTRACED)) return 0; + current->ptrace_message = message; ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); /* @@ -73,6 +75,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) current->exit_code = 0; } + current->ptrace_message = 0; return fatal_signal_pending(current); } @@ -98,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) static inline __must_check int tracehook_report_syscall_entry( struct pt_regs *regs) { - return ptrace_report_syscall(regs); + return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** @@ -123,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) if (step) user_single_step_report(regs); else - ptrace_report_syscall(regs); + ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT); } /** diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index d5a1b8a492b9..a71b6e3b03eb 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -73,6 +73,41 @@ struct seccomp_metadata { __u64 flags; /* Output: filter's flags */ }; +#define PTRACE_GET_SYSCALL_INFO 0x420e +#define PTRACE_SYSCALL_INFO_NONE 0 +#define PTRACE_SYSCALL_INFO_ENTRY 1 +#define PTRACE_SYSCALL_INFO_EXIT 2 +#define PTRACE_SYSCALL_INFO_SECCOMP 3 + +struct ptrace_syscall_info { + __u8 op; /* PTRACE_SYSCALL_INFO_* */ + __u32 arch __attribute__((__aligned__(sizeof(__u32)))); + __u64 instruction_pointer; + __u64 stack_pointer; + union { + struct { + __u64 nr; + __u64 args[6]; + } entry; + struct { + __s64 rval; + __u8 is_error; + } exit; + struct { + __u64 nr; + __u64 args[6]; + __u32 ret_data; + } seccomp; + }; +}; + +/* + * These values are stored in task->ptrace_message + * by tracehook_report_syscall_* to describe the current syscall-stop. + */ +#define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 +#define PTRACE_EVENTMSG_SYSCALL_EXIT 2 + /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83a531cea2f3..cb9ddcc08119 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -32,6 +32,8 @@ #include #include +#include /* for syscall_get_* */ + /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, @@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, * to ensure no machine forgets it. */ EXPORT_SYMBOL_GPL(task_user_regset_view); -#endif + +static unsigned long +ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int i; + + info->op = PTRACE_SYSCALL_INFO_ENTRY; + info->entry.nr = syscall_get_nr(child, regs); + syscall_get_arguments(child, regs, args); + for (i = 0; i < ARRAY_SIZE(args); i++) + info->entry.args[i] = args[i]; + + /* args is the last field in struct ptrace_syscall_info.entry */ + return offsetofend(struct ptrace_syscall_info, entry.args); +} + +static unsigned long +ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * As struct ptrace_syscall_info.entry is currently a subset + * of struct ptrace_syscall_info.seccomp, it makes sense to + * initialize that subset using ptrace_get_syscall_info_entry(). + * This can be reconsidered in the future if these structures + * diverge significantly enough. + */ + ptrace_get_syscall_info_entry(child, regs, info); + info->op = PTRACE_SYSCALL_INFO_SECCOMP; + info->seccomp.ret_data = child->ptrace_message; + + /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); +} + +static unsigned long +ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + info->op = PTRACE_SYSCALL_INFO_EXIT; + info->exit.rval = syscall_get_error(child, regs); + info->exit.is_error = !!info->exit.rval; + if (!info->exit.is_error) + info->exit.rval = syscall_get_return_value(child, regs); + + /* is_error is the last field in struct ptrace_syscall_info.exit */ + return offsetofend(struct ptrace_syscall_info, exit.is_error); +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = PTRACE_SYSCALL_INFO_NONE, + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + /* + * This does not need lock_task_sighand() to access + * child->last_siginfo because ptrace_freeze_traced() + * called earlier by ptrace_check_attach() ensures that + * the tracee cannot go away and clear its last_siginfo. + */ + switch (child->last_siginfo ? child->last_siginfo->si_code : 0) { + case SIGTRAP | 0x80: + switch (child->ptrace_message) { + case PTRACE_EVENTMSG_SYSCALL_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, + &info); + break; + case PTRACE_EVENTMSG_SYSCALL_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, + &info); + break; + } + break; + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + actual_size = ptrace_get_syscall_info_seccomp(child, regs, + &info); + break; + } + + write_size = min(actual_size, user_size); + return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request, ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } + + case PTRACE_GET_SYSCALL_INFO: + ret = ptrace_get_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index dc66fe852768..6ef7f16c4cf5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1775,13 +1775,18 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, unsigned long msg; static bool entry; - /* Make sure we got an empty message. */ + /* + * The traditional way to tell PTRACE_SYSCALL entry/exit + * is by counting. + */ + entry = !entry; + + /* Make sure we got an appropriate message. */ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg); EXPECT_EQ(0, ret); - EXPECT_EQ(0, msg); + EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY + : PTRACE_EVENTMSG_SYSCALL_EXIT, msg); - /* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */ - entry = !entry; if (!entry) return; -- cgit v1.2.3 From ac76de555d76b8cc7f8ef231692a3ad9cbd0ce63 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 16 Jul 2019 16:29:46 -0700 Subject: selftests/ptrace: add a test case for PTRACE_GET_SYSCALL_INFO Check whether PTRACE_GET_SYSCALL_INFO semantics implemented in the kernel matches userspace expectations. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190510152852.GG28558@altlinux.org Signed-off-by: Dmitry V. Levin Acked-by: Shuah Khan Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Benjamin Herrenschmidt Cc: Greentime Hu Cc: Helge Deller [parisc] Cc: James E.J. Bottomley Cc: James Hogan Cc: kbuild test robot Cc: Kees Cook Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Kuo Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/ptrace/.gitignore | 1 + tools/testing/selftests/ptrace/Makefile | 2 +- tools/testing/selftests/ptrace/get_syscall_info.c | 271 ++++++++++++++++++++++ 3 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/ptrace/get_syscall_info.c (limited to 'tools') diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore index b3e59d41fd82..cfcc49a7def7 100644 --- a/tools/testing/selftests/ptrace/.gitignore +++ b/tools/testing/selftests/ptrace/.gitignore @@ -1 +1,2 @@ +get_syscall_info peeksiginfo diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile index cb21c76a18ca..c0b7f89f0930 100644 --- a/tools/testing/selftests/ptrace/Makefile +++ b/tools/testing/selftests/ptrace/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -iquote../../../../include/uapi -Wall -TEST_GEN_PROGS := peeksiginfo +TEST_GEN_PROGS := get_syscall_info peeksiginfo include ../lib.mk diff --git a/tools/testing/selftests/ptrace/get_syscall_info.c b/tools/testing/selftests/ptrace/get_syscall_info.c new file mode 100644 index 000000000000..5bcd1c7b5be6 --- /dev/null +++ b/tools/testing/selftests/ptrace/get_syscall_info.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2018 Dmitry V. Levin + * All rights reserved. + * + * Check whether PTRACE_GET_SYSCALL_INFO semantics implemented in the kernel + * matches userspace expectations. + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include "linux/ptrace.h" + +static int +kill_tracee(pid_t pid) +{ + if (!pid) + return 0; + + int saved_errno = errno; + + int rc = kill(pid, SIGKILL); + + errno = saved_errno; + return rc; +} + +static long +sys_ptrace(int request, pid_t pid, unsigned long addr, unsigned long data) +{ + return syscall(__NR_ptrace, request, pid, addr, data); +} + +#define LOG_KILL_TRACEE(fmt, ...) \ + do { \ + kill_tracee(pid); \ + TH_LOG("wait #%d: " fmt, \ + ptrace_stop, ##__VA_ARGS__); \ + } while (0) + +TEST(get_syscall_info) +{ + static const unsigned long args[][7] = { + /* a sequence of architecture-agnostic syscalls */ + { + __NR_chdir, + (unsigned long) "", + 0xbad1fed1, + 0xbad2fed2, + 0xbad3fed3, + 0xbad4fed4, + 0xbad5fed5 + }, + { + __NR_gettid, + 0xcaf0bea0, + 0xcaf1bea1, + 0xcaf2bea2, + 0xcaf3bea3, + 0xcaf4bea4, + 0xcaf5bea5 + }, + { + __NR_exit_group, + 0, + 0xfac1c0d1, + 0xfac2c0d2, + 0xfac3c0d3, + 0xfac4c0d4, + 0xfac5c0d5 + } + }; + const unsigned long *exp_args; + + pid_t pid = fork(); + + ASSERT_LE(0, pid) { + TH_LOG("fork: %m"); + } + + if (pid == 0) { + /* get the pid before PTRACE_TRACEME */ + pid = getpid(); + ASSERT_EQ(0, sys_ptrace(PTRACE_TRACEME, 0, 0, 0)) { + TH_LOG("PTRACE_TRACEME: %m"); + } + ASSERT_EQ(0, kill(pid, SIGSTOP)) { + /* cannot happen */ + TH_LOG("kill SIGSTOP: %m"); + } + for (unsigned int i = 0; i < ARRAY_SIZE(args); ++i) { + syscall(args[i][0], + args[i][1], args[i][2], args[i][3], + args[i][4], args[i][5], args[i][6]); + } + /* unreachable */ + _exit(1); + } + + const struct { + unsigned int is_error; + int rval; + } *exp_param, exit_param[] = { + { 1, -ENOENT }, /* chdir */ + { 0, pid } /* gettid */ + }; + + unsigned int ptrace_stop; + + for (ptrace_stop = 0; ; ++ptrace_stop) { + struct ptrace_syscall_info info = { + .op = 0xff /* invalid PTRACE_SYSCALL_INFO_* op */ + }; + const size_t size = sizeof(info); + const int expected_none_size = + (void *) &info.entry - (void *) &info; + const int expected_entry_size = + (void *) &info.entry.args[6] - (void *) &info; + const int expected_exit_size = + (void *) (&info.exit.is_error + 1) - + (void *) &info; + int status; + long rc; + + ASSERT_EQ(pid, wait(&status)) { + /* cannot happen */ + LOG_KILL_TRACEE("wait: %m"); + } + if (WIFEXITED(status)) { + pid = 0; /* the tracee is no more */ + ASSERT_EQ(0, WEXITSTATUS(status)); + break; + } + ASSERT_FALSE(WIFSIGNALED(status)) { + pid = 0; /* the tracee is no more */ + LOG_KILL_TRACEE("unexpected signal %u", + WTERMSIG(status)); + } + ASSERT_TRUE(WIFSTOPPED(status)) { + /* cannot happen */ + LOG_KILL_TRACEE("unexpected wait status %#x", status); + } + + switch (WSTOPSIG(status)) { + case SIGSTOP: + ASSERT_EQ(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected signal stop"); + } + ASSERT_EQ(0, sys_ptrace(PTRACE_SETOPTIONS, pid, 0, + PTRACE_O_TRACESYSGOOD)) { + LOG_KILL_TRACEE("PTRACE_SETOPTIONS: %m"); + } + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + pid, size, + (unsigned long) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m"); + } + ASSERT_EQ(expected_none_size, rc) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_EQ(PTRACE_SYSCALL_INFO_NONE, info.op) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_TRUE(info.arch) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_TRUE(info.instruction_pointer) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_TRUE(info.stack_pointer) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + break; + + case SIGTRAP | 0x80: + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + pid, size, + (unsigned long) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m"); + } + switch (ptrace_stop) { + case 1: /* entering chdir */ + case 3: /* entering gettid */ + case 5: /* entering exit_group */ + exp_args = args[ptrace_stop / 2]; + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(PTRACE_SYSCALL_INFO_ENTRY, info.op) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_TRUE(info.arch) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_TRUE(info.instruction_pointer) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_TRUE(info.stack_pointer) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[0], info.entry.nr) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[1], info.entry.args[0]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[2], info.entry.args[1]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[3], info.entry.args[2]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[4], info.entry.args[3]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[5], info.entry.args[4]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[6], info.entry.args[5]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + break; + case 2: /* exiting chdir */ + case 4: /* exiting gettid */ + exp_param = &exit_param[ptrace_stop / 2 - 1]; + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_EQ(PTRACE_SYSCALL_INFO_EXIT, info.op) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_TRUE(info.arch) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_TRUE(info.instruction_pointer) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_TRUE(info.stack_pointer) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_EQ(exp_param->is_error, + info.exit.is_error) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_EQ(exp_param->rval, info.exit.rval) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + break; + default: + LOG_KILL_TRACEE("unexpected syscall stop"); + abort(); + } + break; + + default: + LOG_KILL_TRACEE("unexpected stop signal %#x", + WSTOPSIG(status)); + abort(); + } + + ASSERT_EQ(0, sys_ptrace(PTRACE_SYSCALL, pid, 0, 0)) { + LOG_KILL_TRACEE("PTRACE_SYSCALL: %m"); + } + } + + ASSERT_EQ(ARRAY_SIZE(args) * 2, ptrace_stop); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From b7dca6dd1e591ad19a9aae716f3898be8063f880 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 17 Jul 2019 15:17:57 +0900 Subject: kbuild: create *.mod with full directory path and remove MODVERDIR While descending directories, Kbuild produces objects for modules, but do not link final *.ko files; it is done in the modpost. To keep track of modules, Kbuild creates a *.mod file in $(MODVERDIR) for every module it is building. Some post-processing steps read the necessary information from *.mod files. This avoids descending into directories again. This mechanism was introduced in 2003 or so. Later, commit 551559e13af1 ("kbuild: implement modules.order") added modules.order. So, we can simply read it out to know all the modules with directory paths. This is easier than parsing the first line of *.mod files. $(MODVERDIR) has a flat directory structure, that is, *.mod files are named only with base names. This is based on the assumption that the module name is unique across the tree. This assumption is really fragile. Stephen Rothwell reported a race condition caused by a module name conflict: https://lkml.org/lkml/2019/5/13/991 In parallel building, two different threads could write to the same $(MODVERDIR)/*.mod simultaneously. Non-unique module names are the source of all kind of troubles, hence commit 3a48a91901c5 ("kbuild: check uniqueness of module names") introduced a new checker script. However, it is still fragile in the build system point of view because this race happens before scripts/modules-check.sh is invoked. If it happens again, the modpost will emit unclear error messages. To fix this issue completely, create *.mod with full directory path so that two threads never attempt to write to the same file. $(MODVERDIR) is no longer needed. Since modules with directory paths are listed in modules.order, Kbuild is still able to find *.mod files without additional descending. I also killed cmd_secanalysis; scripts/mod/sumversion.c computes MD4 hash for modules with MODULE_VERSION(). When CONFIG_DEBUG_SECTION_MISMATCH=y, it occurs not only in the modpost stage, but also during directory descending, where sumversion.c may parse stale *.mod files. It would emit 'No such file or directory' warning when an object consisting a module is renamed, or when a single-obj module is turned into a multi-obj module or vice versa. Signed-off-by: Masahiro Yamada Acked-by: Nicolas Pitre --- .gitignore | 1 + Documentation/dontdiff | 1 + Makefile | 20 +++----------------- lib/Kconfig.debug | 12 +----------- scripts/Makefile.build | 15 +++------------ scripts/Makefile.modpost | 4 ++-- scripts/adjust_autoksyms.sh | 14 +++++--------- scripts/mod/sumversion.c | 16 +++------------- scripts/package/mkspec | 2 +- tools/power/cpupower/debug/kernel/Makefile | 4 ++-- 10 files changed, 22 insertions(+), 67 deletions(-) (limited to 'tools') diff --git a/.gitignore b/.gitignore index 7587ef56b92d..8f5422cba6e2 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ *.lz4 *.lzma *.lzo +*.mod *.mod.c *.o *.o.* diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 5eba889ea84d..9f4392876099 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -30,6 +30,7 @@ *.lzo *.mo *.moc +*.mod *.mod.c *.o *.o.* diff --git a/Makefile b/Makefile index d9103e731187..6308c8cea121 100644 --- a/Makefile +++ b/Makefile @@ -486,11 +486,6 @@ export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL export KBUILD_ARFLAGS -# When compiling out-of-tree modules, put MODVERDIR in the module -# tree rather than in the kernel tree. The kernel tree might -# even be read-only. -export MODVERDIR := $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_versions - # Files to ignore in find ... statements export RCS_FIND_IGNORE := \( -name SCCS -o -name BitKeeper -o -name .svn -o \ @@ -1029,8 +1024,8 @@ vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_OBJS) $(KBUILD_VMLINUX_LIBS) # Recurse until adjust_autoksyms.sh is satisfied PHONY += autoksyms_recursive -autoksyms_recursive: $(vmlinux-deps) ifdef CONFIG_TRIM_UNUSED_KSYMS +autoksyms_recursive: $(vmlinux-deps) modules.order $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh \ "$(MAKE) -f $(srctree)/Makefile vmlinux" endif @@ -1113,7 +1108,6 @@ endif prepare1: prepare3 outputmakefile asm-generic $(version_h) $(autoksyms_h) \ include/generated/utsrelease.h - $(cmd_crmodverdir) archprepare: archheaders archscripts prepare1 scripts @@ -1371,7 +1365,7 @@ endif # CONFIG_MODULES # make distclean Remove editor backup files, patch leftover files and the like # Directories & files removed with 'make clean' -CLEAN_DIRS += $(MODVERDIR) include/ksym +CLEAN_DIRS += include/ksym CLEAN_FILES += modules.builtin.modinfo # Directories & files removed with 'make mrproper' @@ -1641,7 +1635,6 @@ PHONY += $(clean-dirs) clean $(clean-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@) -clean: rm-dirs := $(MODVERDIR) clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers PHONY += help @@ -1655,8 +1648,6 @@ help: @echo '' PHONY += prepare -prepare: - $(cmd_crmodverdir) endif # KBUILD_EXTMOD clean: $(clean-dirs) @@ -1667,7 +1658,7 @@ clean: $(clean-dirs) -o -name '*.ko.*' \ -o -name '*.dtb' -o -name '*.dtb.S' -o -name '*.dt.yaml' \ -o -name '*.dwo' -o -name '*.lst' \ - -o -name '*.su' \ + -o -name '*.su' -o -name '*.mod' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.lex.c' -o -name '*.tab.[ch]' \ -o -name '*.asn1.[ch]' \ @@ -1794,11 +1785,6 @@ quiet_cmd_depmod = DEPMOD $(KERNELRELEASE) cmd_depmod = $(CONFIG_SHELL) $(srctree)/scripts/depmod.sh $(DEPMOD) \ $(KERNELRELEASE) -# Create temporary dir for module support files -# clean it up only when building all modules -cmd_crmodverdir = $(Q)mkdir -p $(MODVERDIR) \ - $(if $(KBUILD_MODULES),; rm -f $(MODVERDIR)/*) - # read saved command lines for existing targets existing-targets := $(wildcard $(sort $(targets))) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4ac4ca21a30a..cde5675340ba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -353,23 +353,13 @@ config DEBUG_SECTION_MISMATCH which results in the code/data being placed in specific sections. The section mismatch analysis is always performed after a full kernel build, and enabling this option causes the following - additional steps to occur: + additional step to occur: - Add the option -fno-inline-functions-called-once to gcc commands. When inlining a function annotated with __init in a non-init function, we would lose the section information and thus the analysis would not catch the illegal reference. This option tells gcc to inline less (but it does result in a larger kernel). - - Run the section mismatch analysis for each module/built-in.a file. - When we run the section mismatch analysis on vmlinux.o, we - lose valuable information about where the mismatch was - introduced. - Running the analysis for each module/built-in.a file - tells where the mismatch happens much closer to the - source. The drawback is that the same mismatch is - reported at least twice. - - Enable verbose reporting from modpost in order to help resolve - the section mismatches that are reported. config SECTION_MISMATCH_WARN_ONLY bool "Make section mismatch errors non-fatal" diff --git a/scripts/Makefile.build b/scripts/Makefile.build index be32a3752de4..c6dfcc028f56 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -67,8 +67,6 @@ ifeq ($(CONFIG_MODULES)$(need-modorder),y1) modorder-target := $(obj)/modules.order endif -# We keep a list of all modules in $(MODVERDIR) - __build: $(if $(KBUILD_BUILTIN),$(builtin-target) $(lib-target) $(extra-y)) \ $(if $(KBUILD_MODULES),$(obj-m) $(modorder-target)) \ $(subdir-ym) $(always) @@ -87,11 +85,6 @@ ifneq ($(KBUILD_ENABLE_EXTRA_GCC_CHECKS),) cmd_checkdoc = $(srctree)/scripts/kernel-doc -none $< endif -# Do section mismatch analysis for each module/built-in.a -ifdef CONFIG_DEBUG_SECTION_MISMATCH - cmd_secanalysis = ; scripts/mod/modpost $@ -endif - # Compile C sources (.c) # --------------------------------------------------------------------------- @@ -278,13 +271,11 @@ $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) -# Single-part modules are special since we need to mark them in $(MODVERDIR) - $(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) @{ echo $(@:.o=.ko); echo $@; \ - $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod) + $(cmd_undef_syms); } > $(patsubst %.o,%.mod,$@) quiet_cmd_cc_lst_c = MKLST $@ cmd_cc_lst_c = $(CC) $(c_flags) -g -c -o $*.o $< && \ @@ -461,12 +452,12 @@ endif # module is turned into a multi object module, $^ will contain header file # dependencies recorded in the .*.cmd file. quiet_cmd_link_multi-m = LD [M] $@ -cmd_link_multi-m = $(LD) $(ld_flags) -r -o $@ $(filter %.o,$^) $(cmd_secanalysis) + cmd_link_multi-m = $(LD) $(ld_flags) -r -o $@ $(filter %.o,$^) $(multi-used-m): FORCE $(call if_changed,link_multi-m) @{ echo $(@:.o=.ko); echo $(filter %.o,$^); \ - $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod) + $(cmd_undef_syms); } > $(patsubst %.o,%.mod,$@) $(call multi_depend, $(multi-used-m), .o, -objs -y -m) targets += $(multi-used-m) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 5841508ffca9..6b19c1a4eae5 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -6,8 +6,8 @@ # Stage one of module building created the following: # a) The individual .o files used for the module # b) A .o file which is the .o files above linked together -# c) A .mod file in $(MODVERDIR)/, listing the name of the -# the preliminary .o file, plus all .o files +# c) A .mod file, listing the name of the preliminary .o file, +# plus all .o files # d) modules.order, which lists all the modules # Stage 2 is handled by this file and does the following diff --git a/scripts/adjust_autoksyms.sh b/scripts/adjust_autoksyms.sh index aab4e299d7a2..2e4a7320bfb4 100755 --- a/scripts/adjust_autoksyms.sh +++ b/scripts/adjust_autoksyms.sh @@ -8,8 +8,7 @@ # # Create/update the include/generated/autoksyms.h file from the list -# of all module's needed symbols as recorded on the third line of -# .tmp_versions/*.mod files. +# of all module's needed symbols as recorded on the third line of *.mod files. # # For each symbol being added or removed, the corresponding dependency # file's timestamp is updated to force a rebuild of the affected source @@ -47,13 +46,10 @@ cat > "$new_ksyms_file" << EOT */ EOT -[ "$(ls -A "$MODVERDIR")" ] && -for mod in "$MODVERDIR"/*.mod; do - sed -n -e '3{s/ /\n/g;/^$/!p;}' "$mod" -done | sort -u | -while read sym; do - echo "#define __KSYM_${sym} 1" -done >> "$new_ksyms_file" +sed 's/ko$/mod/' modules.order | +xargs -n1 sed -n -e '3{s/ /\n/g;/^$/!p;}' -- | +sort -u | +sed -e 's/\(.*\)/#define __KSYM_\1 1/' >> "$new_ksyms_file" # Special case for modversions (see modpost.c) if [ -n "$CONFIG_MODVERSIONS" ]; then diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index 0f6dcb4011a8..166f3fa247a9 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -396,21 +396,11 @@ void get_src_version(const char *modname, char sum[], unsigned sumlen) unsigned long len; struct md4_ctx md; char *sources, *end, *fname; - const char *basename; char filelist[PATH_MAX + 1]; - char *modverdir = getenv("MODVERDIR"); - if (!modverdir) - modverdir = "."; - - /* Source files for module are in .tmp_versions/modname.mod, - after the first line. */ - if (strrchr(modname, '/')) - basename = strrchr(modname, '/') + 1; - else - basename = modname; - snprintf(filelist, sizeof(filelist), "%s/%.*s.mod", modverdir, - (int) strlen(basename) - 2, basename); + /* objects for a module are listed in the second line of *.mod file. */ + snprintf(filelist, sizeof(filelist), "%.*smod", + (int)strlen(modname) - 1, modname); file = grab_file(filelist, &len); if (!file) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index 2d29df4a0a53..8640c278f1aa 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -29,7 +29,7 @@ fi PROVIDES="$PROVIDES kernel-$KERNELRELEASE" __KERNELRELEASE=$(echo $KERNELRELEASE | sed -e "s/-/_/g") -EXCLUDES="$RCS_TAR_IGNORE --exclude=.tmp_versions --exclude=*vmlinux* \ +EXCLUDES="$RCS_TAR_IGNORE --exclude=*vmlinux* --exclude=*.mod \ --exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation \ --exclude=.config.old --exclude=.missing-syscalls.d --exclude=*.s" diff --git a/tools/power/cpupower/debug/kernel/Makefile b/tools/power/cpupower/debug/kernel/Makefile index c23e5a6ceb7e..7b5c43684be1 100644 --- a/tools/power/cpupower/debug/kernel/Makefile +++ b/tools/power/cpupower/debug/kernel/Makefile @@ -12,8 +12,8 @@ default: $(MAKE) -C $(KDIR) M=$(CURDIR) clean: - - rm -rf *.o *.ko .tmp-versions .*.cmd .*.mod.* *.mod.c - - rm -rf .tmp_versions* Module.symvers modules.order + - rm -rf *.o *.ko .*.cmd .*.mod.* *.mod.c + - rm -rf Module.symvers modules.order install: default install -d $(KMISC) -- cgit v1.2.3 From adb701d6cfa432f5dbdf28839b5e64291a7ed30b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Jul 2019 14:41:59 -0700 Subject: selftests: add a test case for rp_filter Add a test case to simulate the loopback packet case fixed in the previous patch. This test gets passed after the fix: IPv4 rp_filter tests TEST: rp_filter passes local packets [ OK ] TEST: rp_filter passes loopback packets [ OK ] Cc: David Ahern Signed-off-by: Cong Wang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_tests.sh | 35 +++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 9457aaeae092..4465fc2dae14 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -9,12 +9,13 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw" +TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter" VERBOSE=0 PAUSE_ON_FAIL=no PAUSE=no IP="ip -netns ns1" +NS_EXEC="ip netns exec ns1" log_test() { @@ -433,6 +434,37 @@ fib_carrier_test() fib_carrier_unicast_test } +fib_rp_filter_test() +{ + echo + echo "IPv4 rp_filter tests" + + setup + + set -e + $IP link set dev lo address 52:54:00:6a:c7:5e + $IP link set dummy0 address 52:54:00:6a:c7:5e + $IP link add dummy1 type dummy + $IP link set dummy1 address 52:54:00:6a:c7:5e + $IP link set dev dummy1 up + $NS_EXEC sysctl -qw net.ipv4.conf.all.rp_filter=1 + $NS_EXEC sysctl -qw net.ipv4.conf.all.accept_local=1 + $NS_EXEC sysctl -qw net.ipv4.conf.all.route_localnet=1 + + $NS_EXEC tc qd add dev dummy1 parent root handle 1: fq_codel + $NS_EXEC tc filter add dev dummy1 parent 1: protocol arp basic action mirred egress redirect dev lo + $NS_EXEC tc filter add dev dummy1 parent 1: protocol ip basic action mirred egress redirect dev lo + set +e + + run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 198.51.100.1" + log_test $? 0 "rp_filter passes local packets" + + run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 127.0.0.1" + log_test $? 0 "rp_filter passes loopback packets" + + cleanup +} + ################################################################################ # Tests on nexthop spec @@ -1557,6 +1589,7 @@ do fib_unreg_test|unregister) fib_unreg_test;; fib_down_test|down) fib_down_test;; fib_carrier_test|carrier) fib_carrier_test;; + fib_rp_filter_test|rp_filter) fib_rp_filter_test;; fib_nexthop_test|nexthop) fib_nexthop_test;; ipv6_route_test|ipv6_rt) ipv6_route_test;; ipv4_route_test|ipv4_rt) ipv4_route_test;; -- cgit v1.2.3 From 3c3ea5031761fdd144b461d23a077c3a0cf427fa Mon Sep 17 00:00:00 2001 From: Michael Forney Date: Wed, 10 Jul 2019 16:17:35 -0500 Subject: objtool: Use Elf_Scn typedef instead of assuming struct name The libelf implementation might use a different struct name, and the Elf_Scn typedef is already used throughout the rest of objtool. Signed-off-by: Michael Forney Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/d270e1be2835fc2a10acf67535ff2ebd2145bf43.1562793448.git.jpoimboe@redhat.com --- tools/objtool/elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index e99e1be19ad9..76e4f7ceab82 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -463,7 +463,7 @@ struct section *elf_create_section(struct elf *elf, const char *name, { struct section *sec, *shstrtab; size_t size = entsize * nr; - struct Elf_Scn *s; + Elf_Scn *s; Elf_Data *data; sec = malloc(sizeof(*sec)); -- cgit v1.2.3 From 8e144797f1a67c52e386161863da4614a23ad913 Mon Sep 17 00:00:00 2001 From: Michael Forney Date: Wed, 10 Jul 2019 16:20:11 -0500 Subject: objtool: Rename elf_open() to prevent conflict with libelf from elftoolchain The elftoolchain version of libelf has a function named elf_open(). The function name isn't quite accurate anyway, since it also reads all the ELF data. Rename it to elf_read(), which is more accurate. [ jpoimboe: rename to elf_read(); write commit description ] Signed-off-by: Michael Forney Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/7ce2d1b35665edf19fd0eb6fbc0b17b81a48e62f.1562793604.git.jpoimboe@redhat.com --- tools/objtool/check.c | 2 +- tools/objtool/elf.c | 2 +- tools/objtool/elf.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 172f99195726..de8f40730b37 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2407,7 +2407,7 @@ int check(const char *_objname, bool orc) objname = _objname; - file.elf = elf_open(objname, orc ? O_RDWR : O_RDONLY); + file.elf = elf_read(objname, orc ? O_RDWR : O_RDONLY); if (!file.elf) return 1; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 76e4f7ceab82..e18698262837 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -401,7 +401,7 @@ static int read_relas(struct elf *elf) return 0; } -struct elf *elf_open(const char *name, int flags) +struct elf *elf_read(const char *name, int flags) { struct elf *elf; Elf_Cmd cmd; diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index e44ca5d51871..2fe0b0aa741d 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -74,7 +74,7 @@ struct elf { }; -struct elf *elf_open(const char *name, int flags); +struct elf *elf_read(const char *name, int flags); struct section *find_section_by_name(struct elf *elf, const char *name); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_name(struct elf *elf, const char *name); -- cgit v1.2.3 From a7e47f26039c26312a4144c3001b4e9fa886bd45 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:46 -0500 Subject: objtool: Add mcsafe_handle_tail() to the uaccess safe list After an objtool improvement, it's reporting that __memcpy_mcsafe() is calling mcsafe_handle_tail() with AC=1: arch/x86/lib/memcpy_64.o: warning: objtool: .fixup+0x13: call to mcsafe_handle_tail() with UACCESS enabled arch/x86/lib/memcpy_64.o: warning: objtool: __memcpy_mcsafe()+0x34: (alt) arch/x86/lib/memcpy_64.o: warning: objtool: __memcpy_mcsafe()+0xb: (branch) arch/x86/lib/memcpy_64.o: warning: objtool: __memcpy_mcsafe()+0x0: <=== (func) mcsafe_handle_tail() is basically an extension of __memcpy_mcsafe(), so AC=1 is supposed to be set. Add mcsafe_handle_tail() to the uaccess safe list. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/035c38f7eac845281d3c3d36749144982e06e58c.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2f8ba0368231..f9494ff8c286 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -490,6 +490,7 @@ static const char *uaccess_safe_builtin[] = { /* misc */ "csum_partial_copy_generic", "__memcpy_mcsafe", + "mcsafe_handle_tail", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ NULL }; -- cgit v1.2.3 From c705cecc8431951b4f34178e6b1db51b4a504c43 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:47 -0500 Subject: objtool: Track original function across branches If 'insn->func' is NULL, objtool skips some important checks, including sibling call validation. So if some .fixup code does an invalid sibling call, objtool ignores it. Treat all code branches (including alts) as part of the original function by keeping track of the original func value from validate_functions(). This improves the usefulness of some clang function fallthrough warnings, and exposes some additional kernel bugs in the process. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/505df630f33c9717e1ccde6e4b64c5303135c25f.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f9494ff8c286..d8a1ce80fded 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1934,13 +1934,12 @@ static int validate_sibling_call(struct instruction *insn, struct insn_state *st * each instruction and validate all the rules described in * tools/objtool/Documentation/stack-validation.txt. */ -static int validate_branch(struct objtool_file *file, struct instruction *first, - struct insn_state state) +static int validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *first, struct insn_state state) { struct alternative *alt; struct instruction *insn, *next_insn; struct section *sec; - struct symbol *func = NULL; int ret; insn = first; @@ -1961,9 +1960,6 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, return 1; } - if (insn->func) - func = insn->func->pfunc; - if (func && insn->ignore) { WARN_FUNC("BUG: why am I validating an ignored function?", sec, insn->offset); @@ -1985,7 +1981,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, i = insn; save_insn = NULL; - func_for_each_insn_continue_reverse(file, insn->func, i) { + func_for_each_insn_continue_reverse(file, func, i) { if (i->save) { save_insn = i; break; @@ -2031,7 +2027,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, if (alt->skip_orig) skip_orig = true; - ret = validate_branch(file, alt->insn, state); + ret = validate_branch(file, func, alt->insn, state); if (ret) { if (backtrace) BT_FUNC("(alt)", insn); @@ -2069,7 +2065,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, if (state.bp_scratch) { WARN("%s uses BP as a scratch register", - insn->func->name); + func->name); return 1; } @@ -2109,8 +2105,8 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, } else if (insn->jump_dest && (!func || !insn->jump_dest->func || insn->jump_dest->func->pfunc == func)) { - ret = validate_branch(file, insn->jump_dest, - state); + ret = validate_branch(file, func, + insn->jump_dest, state); if (ret) { if (backtrace) BT_FUNC("(branch)", insn); @@ -2176,7 +2172,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, break; case INSN_CLAC: - if (!state.uaccess && insn->func) { + if (!state.uaccess && func) { WARN_FUNC("redundant UACCESS disable", sec, insn->offset); return 1; } @@ -2197,7 +2193,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, break; case INSN_CLD: - if (!state.df && insn->func) + if (!state.df && func) WARN_FUNC("redundant CLD", sec, insn->offset); state.df = false; @@ -2236,7 +2232,7 @@ static int validate_unwind_hints(struct objtool_file *file) for_each_insn(file, insn) { if (insn->hint && !insn->visited) { - ret = validate_branch(file, insn, state); + ret = validate_branch(file, insn->func, insn, state); if (ret && backtrace) BT_FUNC("<=== (hint)", insn); warnings += ret; @@ -2363,12 +2359,12 @@ static int validate_functions(struct objtool_file *file) continue; insn = find_insn(file, sec, func->offset); - if (!insn || insn->ignore) + if (!insn || insn->ignore || insn->visited) continue; state.uaccess = func->alias->uaccess_safe; - ret = validate_branch(file, insn, state); + ret = validate_branch(file, func, insn, state); if (ret && backtrace) BT_FUNC("<=== (func)", insn); warnings += ret; -- cgit v1.2.3 From e10cd8fe8ddfd28a172d2be57ae0e90c7f752e6a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:48 -0500 Subject: objtool: Refactor function alias logic - Add an alias check in validate_functions(). With this change, aliases no longer need uaccess_safe set. - Add an alias check in decode_instructions(). With this change, the "if (!insn->func)" check is no longer needed. - Don't create aliases for zero-length functions, as it can have unexpected results. The next patch will spit out a warning for zero-length functions anyway. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/26a99c31426540f19c9a58b9e10727c385a147bc.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 16 +++++++++------- tools/objtool/elf.c | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d8a1ce80fded..3f8664b0e3f9 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -276,7 +276,7 @@ static int decode_instructions(struct objtool_file *file) } list_for_each_entry(func, &sec->symbol_list, list) { - if (func->type != STT_FUNC) + if (func->type != STT_FUNC || func->alias != func) continue; if (!find_insn(file, sec, func->offset)) { @@ -286,8 +286,7 @@ static int decode_instructions(struct objtool_file *file) } func_for_each_insn(file, func, insn) - if (!insn->func) - insn->func = func; + insn->func = func; } } @@ -508,7 +507,7 @@ static void add_uaccess_safe(struct objtool_file *file) if (!func) continue; - func->alias->uaccess_safe = true; + func->uaccess_safe = true; } } @@ -1887,7 +1886,7 @@ static bool insn_state_match(struct instruction *insn, struct insn_state *state) static inline bool func_uaccess_safe(struct symbol *func) { if (func) - return func->alias->uaccess_safe; + return func->uaccess_safe; return false; } @@ -2355,14 +2354,17 @@ static int validate_functions(struct objtool_file *file) for_each_sec(file, sec) { list_for_each_entry(func, &sec->symbol_list, list) { - if (func->type != STT_FUNC || func->pfunc != func) + if (func->type != STT_FUNC) + continue; + + if (func->pfunc != func || func->alias != func) continue; insn = find_insn(file, sec, func->offset); if (!insn || insn->ignore || insn->visited) continue; - state.uaccess = func->alias->uaccess_safe; + state.uaccess = func->uaccess_safe; ret = validate_branch(file, func, insn, state); if (ret && backtrace) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index e18698262837..9194732a673d 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -278,7 +278,7 @@ static int read_symbols(struct elf *elf) } if (sym->offset == s->offset) { - if (sym->len == s->len && alias == sym) + if (sym->len && sym->len == s->len && alias == sym) alias = s; if (sym->len >= s->len) { -- cgit v1.2.3 From 61e9b75a0ccf1fecacc28a2d77ea4a19aa404e39 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:49 -0500 Subject: objtool: Warn on zero-length functions All callable functions should have an ELF size. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/03d429c4fa87829c61c5dc0e89652f4d9efb62f1.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3f8664b0e3f9..dece3253ff6a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2357,6 +2357,12 @@ static int validate_functions(struct objtool_file *file) if (func->type != STT_FUNC) continue; + if (!func->len) { + WARN("%s() is missing an ELF size annotation", + func->name); + warnings++; + } + if (func->pfunc != func || func->alias != func) continue; -- cgit v1.2.3 From 8e25c9f8b482ea8d8b6fb4f6f5c09bcc5ee18663 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:50 -0500 Subject: objtool: Change dead_end_function() to return boolean dead_end_function() can no longer return an error. Simplify its interface by making it return boolean. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/9e6679610768fb6e6c51dca23f7d4d0c03b0c910.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index dece3253ff6a..d9d1c9b54947 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -105,14 +105,9 @@ static struct instruction *next_insn_same_func(struct objtool_file *file, * * For local functions, we have to detect them manually by simply looking for * the lack of a return instruction. - * - * Returns: - * -1: error - * 0: no dead end - * 1: dead end */ -static int __dead_end_function(struct objtool_file *file, struct symbol *func, - int recursion) +static bool __dead_end_function(struct objtool_file *file, struct symbol *func, + int recursion) { int i; struct instruction *insn; @@ -139,29 +134,29 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, }; if (func->bind == STB_WEAK) - return 0; + return false; if (func->bind == STB_GLOBAL) for (i = 0; i < ARRAY_SIZE(global_noreturns); i++) if (!strcmp(func->name, global_noreturns[i])) - return 1; + return true; if (!func->len) - return 0; + return false; insn = find_insn(file, func->sec, func->offset); if (!insn->func) - return 0; + return false; func_for_each_insn_all(file, func, insn) { empty = false; if (insn->type == INSN_RETURN) - return 0; + return false; } if (empty) - return 0; + return false; /* * A function can have a sibling call instead of a return. In that @@ -174,7 +169,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, if (!dest) /* sibling call to another file */ - return 0; + return false; if (dest->func && dest->func->pfunc != insn->func->pfunc) { @@ -186,7 +181,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, * This is a very rare case. It means * they aren't dead ends. */ - return 0; + return false; } return __dead_end_function(file, dest->func, @@ -196,13 +191,13 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, if (insn->type == INSN_JUMP_DYNAMIC && list_empty(&insn->alts)) /* sibling call */ - return 0; + return false; } - return 1; + return true; } -static int dead_end_function(struct objtool_file *file, struct symbol *func) +static bool dead_end_function(struct objtool_file *file, struct symbol *func) { return __dead_end_function(file, func, 0); } @@ -2080,11 +2075,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (is_fentry_call(insn)) break; - ret = dead_end_function(file, insn->call_dest); - if (ret == 1) + if (dead_end_function(file, insn->call_dest)) return 0; - if (ret == -1) - return 1; } if (!no_fp && func && !has_valid_stack_frame(&state)) { -- cgit v1.2.3 From c9bab22bc449ad2496a6bbbf68acc711d9c5301c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:51 -0500 Subject: objtool: Do frame pointer check before dead end check Even calls to __noreturn functions need the frame pointer setup first. Such functions often dump the stack. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/aed62fbd60e239280218be623f751a433658e896.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d9d1c9b54947..0d2a8e54a82e 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -133,6 +133,9 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "rewind_stack_do_exit", }; + if (!func) + return false; + if (func->bind == STB_WEAK) return false; @@ -2071,19 +2074,16 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (ret) return ret; - if (insn->type == INSN_CALL) { - if (is_fentry_call(insn)) - break; - - if (dead_end_function(file, insn->call_dest)) - return 0; - } - - if (!no_fp && func && !has_valid_stack_frame(&state)) { + if (!no_fp && func && !is_fentry_call(insn) && + !has_valid_stack_frame(&state)) { WARN_FUNC("call without frame pointer save/setup", sec, insn->offset); return 1; } + + if (dead_end_function(file, insn->call_dest)) + return 0; + break; case INSN_JUMP_CONDITIONAL: -- cgit v1.2.3 From 0c1ddd33177530feb3685a800bba1ac4cc58cc4b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:52 -0500 Subject: objtool: Refactor sibling call detection logic Simplify the sibling call detection logic a bit. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/8357dbef9e7f5512e76bf83a76c81722fc09eb5e.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 65 ++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0d2a8e54a82e..7fe31e0f8afe 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -97,6 +97,20 @@ static struct instruction *next_insn_same_func(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) +static bool is_sibling_call(struct instruction *insn) +{ + /* An indirect jump is either a sibling call or a jump to a table. */ + if (insn->type == INSN_JUMP_DYNAMIC) + return list_empty(&insn->alts); + + if (insn->type != INSN_JUMP_CONDITIONAL && + insn->type != INSN_JUMP_UNCONDITIONAL) + return false; + + /* add_jump_destinations() sets insn->call_dest for sibling calls. */ + return !!insn->call_dest; +} + /* * This checks to see if the given function is a "noreturn" function. * @@ -167,34 +181,25 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, * of the sibling call returns. */ func_for_each_insn_all(file, func, insn) { - if (insn->type == INSN_JUMP_UNCONDITIONAL) { + if (is_sibling_call(insn)) { struct instruction *dest = insn->jump_dest; if (!dest) /* sibling call to another file */ return false; - if (dest->func && dest->func->pfunc != insn->func->pfunc) { - - /* local sibling call */ - if (recursion == 5) { - /* - * Infinite recursion: two functions - * have sibling calls to each other. - * This is a very rare case. It means - * they aren't dead ends. - */ - return false; - } - - return __dead_end_function(file, dest->func, - recursion + 1); + /* local sibling call */ + if (recursion == 5) { + /* + * Infinite recursion: two functions have + * sibling calls to each other. This is a very + * rare case. It means they aren't dead ends. + */ + return false; } - } - if (insn->type == INSN_JUMP_DYNAMIC && list_empty(&insn->alts)) - /* sibling call */ - return false; + return __dead_end_function(file, dest->func, recursion+1); + } } return true; @@ -581,9 +586,8 @@ static int add_jump_destinations(struct objtool_file *file) insn->retpoline_safe = true; continue; } else { - /* sibling call */ + /* external sibling call */ insn->call_dest = rela->sym; - insn->jump_dest = NULL; continue; } @@ -633,9 +637,8 @@ static int add_jump_destinations(struct objtool_file *file) } else if (insn->jump_dest->func->pfunc != insn->func->pfunc && insn->jump_dest->offset == insn->jump_dest->func->offset) { - /* sibling class */ + /* internal sibling call */ insn->call_dest = insn->jump_dest->func; - insn->jump_dest = NULL; } } } @@ -1889,7 +1892,7 @@ static inline bool func_uaccess_safe(struct symbol *func) return false; } -static inline const char *insn_dest_name(struct instruction *insn) +static inline const char *call_dest_name(struct instruction *insn) { if (insn->call_dest) return insn->call_dest->name; @@ -1901,13 +1904,13 @@ static int validate_call(struct instruction *insn, struct insn_state *state) { if (state->uaccess && !func_uaccess_safe(insn->call_dest)) { WARN_FUNC("call to %s() with UACCESS enabled", - insn->sec, insn->offset, insn_dest_name(insn)); + insn->sec, insn->offset, call_dest_name(insn)); return 1; } if (state->df) { WARN_FUNC("call to %s() with DF set", - insn->sec, insn->offset, insn_dest_name(insn)); + insn->sec, insn->offset, call_dest_name(insn)); return 1; } @@ -2088,14 +2091,12 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_JUMP_CONDITIONAL: case INSN_JUMP_UNCONDITIONAL: - if (func && !insn->jump_dest) { + if (func && is_sibling_call(insn)) { ret = validate_sibling_call(insn, &state); if (ret) return ret; - } else if (insn->jump_dest && - (!func || !insn->jump_dest->func || - insn->jump_dest->func->pfunc == func)) { + } else if (insn->jump_dest) { ret = validate_branch(file, func, insn->jump_dest, state); if (ret) { @@ -2111,7 +2112,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_JUMP_DYNAMIC: - if (func && list_empty(&insn->alts)) { + if (func && is_sibling_call(insn)) { ret = validate_sibling_call(insn, &state); if (ret) return ret; -- cgit v1.2.3 From e7c2bc37bfae120bce3e7cc8c8abf9d110af0757 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:53 -0500 Subject: objtool: Refactor jump table code Now that C jump tables are supported, call them "jump tables" instead of "switch tables". Also rename some other variables, add comments, and simplify the code flow a bit. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/cf951b0c0641628e0b9b81f7ceccd9bcabcb4bd8.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 82 +++++++++++++++++++++++++++------------------------ tools/objtool/elf.c | 2 +- tools/objtool/elf.h | 2 +- 3 files changed, 46 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7fe31e0f8afe..4525cf677a1b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -627,7 +627,7 @@ static int add_jump_destinations(struct objtool_file *file) * However this code can't completely replace the * read_symbols() code because this doesn't detect the * case where the parent function's only reference to a - * subfunction is through a switch table. + * subfunction is through a jump table. */ if (!strstr(insn->func->name, ".cold.") && strstr(insn->jump_dest->func->name, ".cold.")) { @@ -899,20 +899,24 @@ out: return ret; } -static int add_switch_table(struct objtool_file *file, struct instruction *insn, +static int add_jump_table(struct objtool_file *file, struct instruction *insn, struct rela *table, struct rela *next_table) { struct rela *rela = table; - struct instruction *alt_insn; + struct instruction *dest_insn; struct alternative *alt; struct symbol *pfunc = insn->func->pfunc; unsigned int prev_offset = 0; - list_for_each_entry_from(rela, &table->rela_sec->rela_list, list) { + /* + * Each @rela is a switch table relocation which points to the target + * instruction. + */ + list_for_each_entry_from(rela, &table->sec->rela_list, list) { if (rela == next_table) break; - /* Make sure the switch table entries are consecutive: */ + /* Make sure the table entries are consecutive: */ if (prev_offset && rela->offset != prev_offset + 8) break; @@ -921,12 +925,12 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn, rela->addend == pfunc->offset) break; - alt_insn = find_insn(file, rela->sym->sec, rela->addend); - if (!alt_insn) + dest_insn = find_insn(file, rela->sym->sec, rela->addend); + if (!dest_insn) break; - /* Make sure the jmp dest is in the function or subfunction: */ - if (alt_insn->func->pfunc != pfunc) + /* Make sure the destination is in the same function: */ + if (dest_insn->func->pfunc != pfunc) break; alt = malloc(sizeof(*alt)); @@ -935,7 +939,7 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn, return -1; } - alt->insn = alt_insn; + alt->insn = dest_insn; list_add_tail(&alt->list, &insn->alts); prev_offset = rela->offset; } @@ -950,7 +954,7 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn, } /* - * find_switch_table() - Given a dynamic jump, find the switch jump table in + * find_jump_table() - Given a dynamic jump, find the switch jump table in * .rodata associated with it. * * There are 3 basic patterns: @@ -992,13 +996,13 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn, * * NOTE: RETPOLINE made it harder still to decode dynamic jumps. */ -static struct rela *find_switch_table(struct objtool_file *file, +static struct rela *find_jump_table(struct objtool_file *file, struct symbol *func, struct instruction *insn) { - struct rela *text_rela, *rodata_rela; + struct rela *text_rela, *table_rela; struct instruction *orig_insn = insn; - struct section *rodata_sec; + struct section *table_sec; unsigned long table_offset; /* @@ -1031,7 +1035,7 @@ static struct rela *find_switch_table(struct objtool_file *file, continue; table_offset = text_rela->addend; - rodata_sec = text_rela->sym->sec; + table_sec = text_rela->sym->sec; if (text_rela->type == R_X86_64_PC32) table_offset += 4; @@ -1045,29 +1049,31 @@ static struct rela *find_switch_table(struct objtool_file *file, * need to be placed in the C_JUMP_TABLE_SECTION section. They * have symbols associated with them. */ - if (find_symbol_containing(rodata_sec, table_offset) && - strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION)) + if (find_symbol_containing(table_sec, table_offset) && + strcmp(table_sec->name, C_JUMP_TABLE_SECTION)) continue; - rodata_rela = find_rela_by_dest(rodata_sec, table_offset); - if (rodata_rela) { - /* - * Use of RIP-relative switch jumps is quite rare, and - * indicates a rare GCC quirk/bug which can leave dead - * code behind. - */ - if (text_rela->type == R_X86_64_PC32) - file->ignore_unreachables = true; + /* Each table entry has a rela associated with it. */ + table_rela = find_rela_by_dest(table_sec, table_offset); + if (!table_rela) + continue; - return rodata_rela; - } + /* + * Use of RIP-relative switch jumps is quite rare, and + * indicates a rare GCC quirk/bug which can leave dead code + * behind. + */ + if (text_rela->type == R_X86_64_PC32) + file->ignore_unreachables = true; + + return table_rela; } return NULL; } -static int add_func_switch_tables(struct objtool_file *file, +static int add_func_jump_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *last = NULL, *prev_jump = NULL; @@ -1080,7 +1086,7 @@ static int add_func_switch_tables(struct objtool_file *file, /* * Store back-pointers for unconditional forward jumps such - * that find_switch_table() can back-track using those and + * that find_jump_table() can back-track using those and * avoid some potentially confusing code. */ if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->jump_dest && @@ -1095,17 +1101,17 @@ static int add_func_switch_tables(struct objtool_file *file, if (insn->type != INSN_JUMP_DYNAMIC) continue; - rela = find_switch_table(file, func, insn); + rela = find_jump_table(file, func, insn); if (!rela) continue; /* - * We found a switch table, but we don't know yet how big it + * We found a jump table, but we don't know yet how big it * is. Don't add it until we reach the end of the function or - * the beginning of another switch table in the same function. + * the beginning of another jump table in the same function. */ if (prev_jump) { - ret = add_switch_table(file, prev_jump, prev_rela, rela); + ret = add_jump_table(file, prev_jump, prev_rela, rela); if (ret) return ret; } @@ -1115,7 +1121,7 @@ static int add_func_switch_tables(struct objtool_file *file, } if (prev_jump) { - ret = add_switch_table(file, prev_jump, prev_rela, NULL); + ret = add_jump_table(file, prev_jump, prev_rela, NULL); if (ret) return ret; } @@ -1128,7 +1134,7 @@ static int add_func_switch_tables(struct objtool_file *file, * section which contains a list of addresses within the function to jump to. * This finds these jump tables and adds them to the insn->alts lists. */ -static int add_switch_table_alts(struct objtool_file *file) +static int add_jump_table_alts(struct objtool_file *file) { struct section *sec; struct symbol *func; @@ -1142,7 +1148,7 @@ static int add_switch_table_alts(struct objtool_file *file) if (func->type != STT_FUNC) continue; - ret = add_func_switch_tables(file, func); + ret = add_func_jump_tables(file, func); if (ret) return ret; } @@ -1339,7 +1345,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = add_switch_table_alts(file); + ret = add_jump_table_alts(file); if (ret) return ret; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 9194732a673d..edba4745f25a 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -385,7 +385,7 @@ static int read_relas(struct elf *elf) rela->offset = rela->rela.r_offset; symndx = GELF_R_SYM(rela->rela.r_info); rela->sym = find_symbol_by_index(elf, symndx); - rela->rela_sec = sec; + rela->sec = sec; if (!rela->sym) { WARN("can't find rela entry symbol %d for %s", symndx, sec->name); diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index 2fe0b0aa741d..d4d3e0528d4a 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -57,7 +57,7 @@ struct rela { struct list_head list; struct hlist_node hash; GElf_Rela rela; - struct section *rela_sec; + struct section *sec; struct symbol *sym; unsigned int type; unsigned long offset; -- cgit v1.2.3 From bd98c81346468fc2f86aeeb44d4d0d6f763a62b7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 17 Jul 2019 20:36:54 -0500 Subject: objtool: Support repeated uses of the same C jump table This fixes objtool for both a GCC issue and a Clang issue: 1) GCC issue: kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8d5: sibling call from callable instruction with modified stack frame With CONFIG_RETPOLINE=n, GCC is doing the following optimization in ___bpf_prog_run(). Before: select_insn: jmp *jumptable(,%rax,8) ... ALU64_ADD_X: ... jmp select_insn ALU_ADD_X: ... jmp select_insn After: select_insn: jmp *jumptable(, %rax, 8) ... ALU64_ADD_X: ... jmp *jumptable(, %rax, 8) ALU_ADD_X: ... jmp *jumptable(, %rax, 8) This confuses objtool. It has never seen multiple indirect jump sites which use the same jump table. For GCC switch tables, the only way of detecting the size of a table is by continuing to scan for more tables. The size of the previous table can only be determined after another switch table is found, or when the scan reaches the end of the function. That logic was reused for C jump tables, and was based on the assumption that each jump table only has a single jump site. The above optimization breaks that assumption. 2) Clang issue: drivers/usb/misc/sisusbvga/sisusb.o: warning: objtool: sisusb_write_mem_bulk()+0x588: can't find switch jump table With clang 9, code can be generated where a function contains two indirect jump instructions which use the same switch table. The fix is the same for both issues: split the jump table parsing into two passes. In the first pass, locate the heads of all switch tables for the function and mark their locations. In the second pass, parse the switch tables and add them. Fixes: e55a73251da3 ("bpf: Fix ORC unwinding in non-JIT BPF code") Reported-by: Randy Dunlap Reported-by: Arnd Bergmann Signed-off-by: Jann Horn Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/e995befaada9d4d8b2cf788ff3f566ba900d2b4d.1563413318.git.jpoimboe@redhat.com Co-developed-by: Josh Poimboeuf --- tools/objtool/check.c | 53 +++++++++++++++++++++++++++------------------------ tools/objtool/check.h | 1 + tools/objtool/elf.h | 1 + 3 files changed, 30 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4525cf677a1b..66f7c01385a4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -900,7 +900,7 @@ out: } static int add_jump_table(struct objtool_file *file, struct instruction *insn, - struct rela *table, struct rela *next_table) + struct rela *table) { struct rela *rela = table; struct instruction *dest_insn; @@ -913,7 +913,9 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, * instruction. */ list_for_each_entry_from(rela, &table->sec->rela_list, list) { - if (rela == next_table) + + /* Check for the end of the table: */ + if (rela != table && rela->jump_table_start) break; /* Make sure the table entries are consecutive: */ @@ -1072,13 +1074,15 @@ static struct rela *find_jump_table(struct objtool_file *file, return NULL; } - -static int add_func_jump_tables(struct objtool_file *file, - struct symbol *func) +/* + * First pass: Mark the head of each jump table so that in the next pass, + * we know when a given jump table ends and the next one starts. + */ +static void mark_func_jump_tables(struct objtool_file *file, + struct symbol *func) { - struct instruction *insn, *last = NULL, *prev_jump = NULL; - struct rela *rela, *prev_rela = NULL; - int ret; + struct instruction *insn, *last = NULL; + struct rela *rela; func_for_each_insn_all(file, func, insn) { if (!last) @@ -1102,26 +1106,24 @@ static int add_func_jump_tables(struct objtool_file *file, continue; rela = find_jump_table(file, func, insn); - if (!rela) - continue; - - /* - * We found a jump table, but we don't know yet how big it - * is. Don't add it until we reach the end of the function or - * the beginning of another jump table in the same function. - */ - if (prev_jump) { - ret = add_jump_table(file, prev_jump, prev_rela, rela); - if (ret) - return ret; + if (rela) { + rela->jump_table_start = true; + insn->jump_table = rela; } - - prev_jump = insn; - prev_rela = rela; } +} + +static int add_func_jump_tables(struct objtool_file *file, + struct symbol *func) +{ + struct instruction *insn; + int ret; + + func_for_each_insn_all(file, func, insn) { + if (!insn->jump_table) + continue; - if (prev_jump) { - ret = add_jump_table(file, prev_jump, prev_rela, NULL); + ret = add_jump_table(file, insn, insn->jump_table); if (ret) return ret; } @@ -1148,6 +1150,7 @@ static int add_jump_table_alts(struct objtool_file *file) if (func->type != STT_FUNC) continue; + mark_func_jump_tables(file, func); ret = add_func_jump_tables(file, func); if (ret) return ret; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index cb60b9acf5cf..afa6a79e0715 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -38,6 +38,7 @@ struct instruction { struct symbol *call_dest; struct instruction *jump_dest; struct instruction *first_jump_src; + struct rela *jump_table; struct list_head alts; struct symbol *func; struct stack_op stack_op; diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index d4d3e0528d4a..44150204db4d 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -62,6 +62,7 @@ struct rela { unsigned int type; unsigned long offset; int addend; + bool jump_table_start; }; struct elf { -- cgit v1.2.3 From e65050b94d8c518fdbee572ea4ca6d352e1fda37 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:55 -0500 Subject: objtool: Fix seg fault on bad switch table entry In one rare case, Clang generated the following code: 5ca: 83 e0 21 and $0x21,%eax 5cd: b9 04 00 00 00 mov $0x4,%ecx 5d2: ff 24 c5 00 00 00 00 jmpq *0x0(,%rax,8) 5d5: R_X86_64_32S .rodata+0x38 which uses the corresponding jump table relocations: 000000000038 000200000001 R_X86_64_64 0000000000000000 .text + 834 000000000040 000200000001 R_X86_64_64 0000000000000000 .text + 5d9 000000000048 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000050 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000058 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000060 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000068 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000070 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000078 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000080 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000088 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000090 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000098 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000a0 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000a8 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000b0 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000b8 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000c0 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000c8 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000d0 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000d8 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000e0 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000e8 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000f0 000200000001 R_X86_64_64 0000000000000000 .text + b96 0000000000f8 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000100 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000108 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000110 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000118 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000120 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000128 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000130 000200000001 R_X86_64_64 0000000000000000 .text + b96 000000000138 000200000001 R_X86_64_64 0000000000000000 .text + 82f 000000000140 000200000001 R_X86_64_64 0000000000000000 .text + 828 Since %eax was masked with 0x21, only the first two and the last two entries are possible. Objtool doesn't actually emulate all the code, so it isn't smart enough to know that all the middle entries aren't reachable. They point to the NOP padding area after the end of the function, so objtool seg faulted when it tried to dereference a NULL insn->func. After this fix, objtool still gives an "unreachable" error because it stops reading the jump table when it encounters the bad addresses: /home/jpoimboe/objtool-tests/adm1275.o: warning: objtool: adm1275_probe()+0x828: unreachable instruction While the above code is technically correct, it's very wasteful of memory -- it uses 34 jump table entries when only 4 are needed. It's also not possible for objtool to validate this type of switch table because the unused entries point outside the function and objtool has no way of determining if that's intentional. Hopefully the Clang folks can fix it. Reported-by: Arnd Bergmann Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/a9db88eec4f1ca089e040989846961748238b6d8.1563413318.git.jpoimboe@redhat.com --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 66f7c01385a4..a966b22a32ef 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -932,7 +932,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, break; /* Make sure the destination is in the same function: */ - if (dest_insn->func->pfunc != pfunc) + if (!dest_insn->func || dest_insn->func->pfunc != pfunc) break; alt = malloc(sizeof(*alt)); -- cgit v1.2.3 From 9fe7b7642fe2c5158904d06fe31b740ca0695a01 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:56 -0500 Subject: objtool: Convert insn type to enum This makes it easier to add new instruction types. Also it's hopefully more robust since the compiler should warn about out-of-range enums. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/0740e96af0d40e54cfd6a07bf09db0fbd10793cd.1563413318.git.jpoimboe@redhat.com --- tools/objtool/arch.h | 35 ++++++++++++++++++----------------- tools/objtool/arch/x86/decode.c | 2 +- tools/objtool/check.c | 7 ------- tools/objtool/check.h | 2 +- 4 files changed, 20 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index 580e344db3dd..50448c0c4bca 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -11,22 +11,23 @@ #include "elf.h" #include "cfi.h" -#define INSN_JUMP_CONDITIONAL 1 -#define INSN_JUMP_UNCONDITIONAL 2 -#define INSN_JUMP_DYNAMIC 3 -#define INSN_CALL 4 -#define INSN_CALL_DYNAMIC 5 -#define INSN_RETURN 6 -#define INSN_CONTEXT_SWITCH 7 -#define INSN_STACK 8 -#define INSN_BUG 9 -#define INSN_NOP 10 -#define INSN_STAC 11 -#define INSN_CLAC 12 -#define INSN_STD 13 -#define INSN_CLD 14 -#define INSN_OTHER 15 -#define INSN_LAST INSN_OTHER +enum insn_type { + INSN_JUMP_CONDITIONAL, + INSN_JUMP_UNCONDITIONAL, + INSN_JUMP_DYNAMIC, + INSN_CALL, + INSN_CALL_DYNAMIC, + INSN_RETURN, + INSN_CONTEXT_SWITCH, + INSN_STACK, + INSN_BUG, + INSN_NOP, + INSN_STAC, + INSN_CLAC, + INSN_STD, + INSN_CLD, + INSN_OTHER, +}; enum op_dest_type { OP_DEST_REG, @@ -68,7 +69,7 @@ void arch_initial_func_cfi_state(struct cfi_state *state); int arch_decode_instruction(struct elf *elf, struct section *sec, unsigned long offset, unsigned int maxlen, - unsigned int *len, unsigned char *type, + unsigned int *len, enum insn_type *type, unsigned long *immediate, struct stack_op *op); bool arch_callee_saved_reg(unsigned char reg); diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 584568f27a83..0567c47a91b1 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -68,7 +68,7 @@ bool arch_callee_saved_reg(unsigned char reg) int arch_decode_instruction(struct elf *elf, struct section *sec, unsigned long offset, unsigned int maxlen, - unsigned int *len, unsigned char *type, + unsigned int *len, enum insn_type *type, unsigned long *immediate, struct stack_op *op) { struct insn insn; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index a966b22a32ef..04572a049cfc 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -267,13 +267,6 @@ static int decode_instructions(struct objtool_file *file) if (ret) goto err; - if (!insn->type || insn->type > INSN_LAST) { - WARN_FUNC("invalid instruction type %d", - insn->sec, insn->offset, insn->type); - ret = -1; - goto err; - } - hash_add(file->insn_hash, &insn->hash, insn->offset); list_add_tail(&insn->list, &file->insn_list); } diff --git a/tools/objtool/check.h b/tools/objtool/check.h index afa6a79e0715..b881fafcf55d 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -31,7 +31,7 @@ struct instruction { struct section *sec; unsigned long offset; unsigned int len; - unsigned char type; + enum insn_type type; unsigned long immediate; bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts; bool retpoline_safe; -- cgit v1.2.3 From b68b9907069a8d3a65bc16a35360bf8f8603c8fa Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:57 -0500 Subject: objtool: Support conditional retpolines A Clang-built kernel is showing the following warning: arch/x86/kernel/platform-quirks.o: warning: objtool: x86_early_init_platform_quirks()+0x84: unreachable instruction That corresponds to this code: 7e: 0f 85 00 00 00 00 jne 84 80: R_X86_64_PC32 __x86_indirect_thunk_r11-0x4 84: c3 retq This is a conditional retpoline sibling call, which is now possible thanks to retpolines. Objtool hasn't seen that before. It's incorrectly interpreting the conditional jump as an unconditional dynamic jump. Reported-by: Nick Desaulniers Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/30d4c758b267ef487fb97e6ecb2f148ad007b554.1563413318.git.jpoimboe@redhat.com --- tools/objtool/arch.h | 1 + tools/objtool/check.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index 50448c0c4bca..ced3765c4f44 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -15,6 +15,7 @@ enum insn_type { INSN_JUMP_CONDITIONAL, INSN_JUMP_UNCONDITIONAL, INSN_JUMP_DYNAMIC, + INSN_JUMP_DYNAMIC_CONDITIONAL, INSN_CALL, INSN_CALL_DYNAMIC, INSN_RETURN, diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 04572a049cfc..5f26620f13f5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -575,7 +575,11 @@ static int add_jump_destinations(struct objtool_file *file) * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. */ - insn->type = INSN_JUMP_DYNAMIC; + if (insn->type == INSN_JUMP_UNCONDITIONAL) + insn->type = INSN_JUMP_DYNAMIC; + else + insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL; + insn->retpoline_safe = true; continue; } else { @@ -2114,13 +2118,17 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_JUMP_DYNAMIC: + case INSN_JUMP_DYNAMIC_CONDITIONAL: if (func && is_sibling_call(insn)) { ret = validate_sibling_call(insn, &state); if (ret) return ret; } - return 0; + if (insn->type == INSN_JUMP_DYNAMIC) + return 0; + + break; case INSN_CONTEXT_SWITCH: if (func && (!next_insn || !next_insn->hint)) { -- cgit v1.2.3 From 01a0f9e4496d9f54e06abb71bf9f56c617ef8c24 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 18 Jul 2019 11:13:35 +0200 Subject: selftests/bpf: fix "valid read map access into a read-only array 1" on s390 This test looks up a 32-bit map element and then loads it using a 64-bit load. This does not work on s390, which is a big-endian machine. Since the point of this test doesn't seem to be loading a smaller value using a larger load, simply use a 32-bit load. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/array_access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index bcb83196e459..f3c33e128709 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -226,7 +226,7 @@ BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_array_ro = { 3 }, -- cgit v1.2.3 From 59fd3486c3dd5678bc2fcac75e14466775465c3e Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 17 Jul 2019 14:26:20 +0200 Subject: selftests/bpf: fix test_xdp_noinline on s390 test_xdp_noinline fails on s390 due to a handful of endianness issues. Use ntohs for parsing eth_proto. Replace bswaps with ntohs/htons. Signed-off-by: Ilya Leoshkevich Acked-by: Vasily Gorbik Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_xdp_noinline.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index dad8a7e33eaa..e88d7b9d65ab 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -14,6 +14,7 @@ #include #include #include "bpf_helpers.h" +#include "bpf_endian.h" static __u32 rol32(__u32 word, unsigned int shift) { @@ -305,7 +306,7 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, ip6h->nexthdr = IPPROTO_IPV6; ip_suffix = pckt->flow.srcv6[3] ^ pckt->flow.port16[0]; ip6h->payload_len = - __builtin_bswap16(pkt_bytes + sizeof(struct ipv6hdr)); + bpf_htons(pkt_bytes + sizeof(struct ipv6hdr)); ip6h->hop_limit = 4; ip6h->saddr.in6_u.u6_addr32[0] = 1; @@ -322,7 +323,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, struct real_definition *dst, __u32 pkt_bytes) { - __u32 ip_suffix = __builtin_bswap16(pckt->flow.port16[0]); + __u32 ip_suffix = bpf_ntohs(pckt->flow.port16[0]); struct eth_hdr *new_eth; struct eth_hdr *old_eth; __u16 *next_iph_u16; @@ -352,7 +353,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, iph->protocol = IPPROTO_IPIP; iph->check = 0; iph->tos = 1; - iph->tot_len = __builtin_bswap16(pkt_bytes + sizeof(struct iphdr)); + iph->tot_len = bpf_htons(pkt_bytes + sizeof(struct iphdr)); /* don't update iph->daddr, since it will overwrite old eth_proto * and multiple iterations of bpf_prog_run() will fail */ @@ -639,7 +640,7 @@ static int process_l3_headers_v6(struct packet_description *pckt, iph_len = sizeof(struct ipv6hdr); *protocol = ip6h->nexthdr; pckt->flow.proto = *protocol; - *pkt_bytes = __builtin_bswap16(ip6h->payload_len); + *pkt_bytes = bpf_ntohs(ip6h->payload_len); off += iph_len; if (*protocol == 45) { return XDP_DROP; @@ -671,7 +672,7 @@ static int process_l3_headers_v4(struct packet_description *pckt, return XDP_DROP; *protocol = iph->protocol; pckt->flow.proto = *protocol; - *pkt_bytes = __builtin_bswap16(iph->tot_len); + *pkt_bytes = bpf_ntohs(iph->tot_len); off += 20; if (iph->frag_off & 65343) return XDP_DROP; @@ -808,10 +809,10 @@ int balancer_ingress(struct xdp_md *ctx) nh_off = sizeof(struct eth_hdr); if (data + nh_off > data_end) return XDP_DROP; - eth_proto = eth->eth_proto; - if (eth_proto == 8) + eth_proto = bpf_ntohs(eth->eth_proto); + if (eth_proto == ETH_P_IP) return process_packet(data, nh_off, data_end, 0, ctx); - else if (eth_proto == 56710) + else if (eth_proto == ETH_P_IPV6) return process_packet(data, nh_off, data_end, 1, ctx); else return XDP_DROP; -- cgit v1.2.3