summaryrefslogtreecommitdiff
path: root/Documentation/trace
diff options
context:
space:
mode:
authorFrederic Weisbecker <fweisbec@gmail.com>2009-09-23 23:08:43 +0200
committerFrederic Weisbecker <fweisbec@gmail.com>2009-09-23 23:08:43 +0200
commitd7a4b414eed51f1653bb05ebe84122bf9a7ae18b (patch)
treebd6603a0c27de4c138a1767871897e9cd3e1a1d2 /Documentation/trace
parent1f0ab40976460bc4673fa204ce917a725185d8f2 (diff)
parenta724eada8c2a7b62463b73ccf73fd0bb6e928aeb (diff)
downloadlwn-d7a4b414eed51f1653bb05ebe84122bf9a7ae18b.tar.gz
lwn-d7a4b414eed51f1653bb05ebe84122bf9a7ae18b.zip
Merge commit 'linus/master' into tracing/kprobes
Conflicts: kernel/trace/Makefile kernel/trace/trace.h kernel/trace/trace_event_types.h kernel/trace/trace_export.c Merge reason: Sync with latest significant tracing core changes.
Diffstat (limited to 'Documentation/trace')
-rw-r--r--Documentation/trace/events-kmem.txt107
-rw-r--r--Documentation/trace/events.txt210
-rw-r--r--Documentation/trace/ftrace-design.txt233
-rw-r--r--Documentation/trace/ftrace.txt8
-rw-r--r--Documentation/trace/postprocess/trace-pagealloc-postprocess.pl418
-rw-r--r--Documentation/trace/power.txt17
-rw-r--r--Documentation/trace/tracepoint-analysis.txt327
7 files changed, 1288 insertions, 32 deletions
diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt
new file mode 100644
index 000000000000..6ef2a8652e17
--- /dev/null
+++ b/Documentation/trace/events-kmem.txt
@@ -0,0 +1,107 @@
+ Subsystem Trace Points: kmem
+
+The tracing system kmem captures events related to object and page allocation
+within the kernel. Broadly speaking there are four major subheadings.
+
+ o Slab allocation of small objects of unknown type (kmalloc)
+ o Slab allocation of small objects of known type
+ o Page allocation
+ o Per-CPU Allocator Activity
+ o External Fragmentation
+
+This document will describe what each of the tracepoints are and why they
+might be useful.
+
+1. Slab allocation of small objects of unknown type
+===================================================
+kmalloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s
+kmalloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d
+kfree call_site=%lx ptr=%p
+
+Heavy activity for these events may indicate that a specific cache is
+justified, particularly if kmalloc slab pages are getting significantly
+internal fragmented as a result of the allocation pattern. By correlating
+kmalloc with kfree, it may be possible to identify memory leaks and where
+the allocation sites were.
+
+
+2. Slab allocation of small objects of known type
+=================================================
+kmem_cache_alloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s
+kmem_cache_alloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d
+kmem_cache_free call_site=%lx ptr=%p
+
+These events are similar in usage to the kmalloc-related events except that
+it is likely easier to pin the event down to a specific cache. At the time
+of writing, no information is available on what slab is being allocated from,
+but the call_site can usually be used to extrapolate that information
+
+3. Page allocation
+==================
+mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s
+mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d
+mm_page_free_direct page=%p pfn=%lu order=%d
+mm_pagevec_free page=%p pfn=%lu order=%d cold=%d
+
+These four events deal with page allocation and freeing. mm_page_alloc is
+a simple indicator of page allocator activity. Pages may be allocated from
+the per-CPU allocator (high performance) or the buddy allocator.
+
+If pages are allocated directly from the buddy allocator, the
+mm_page_alloc_zone_locked event is triggered. This event is important as high
+amounts of activity imply high activity on the zone->lock. Taking this lock
+impairs performance by disabling interrupts, dirtying cache lines between
+CPUs and serialising many CPUs.
+
+When a page is freed directly by the caller, the mm_page_free_direct event
+is triggered. Significant amounts of activity here could indicate that the
+callers should be batching their activities.
+
+When pages are freed using a pagevec, the mm_pagevec_free is
+triggered. Broadly speaking, pages are taken off the LRU lock in bulk and
+freed in batch with a pagevec. Significant amounts of activity here could
+indicate that the system is under memory pressure and can also indicate
+contention on the zone->lru_lock.
+
+4. Per-CPU Allocator Activity
+=============================
+mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d
+mm_page_pcpu_drain page=%p pfn=%lu order=%d cpu=%d migratetype=%d
+
+In front of the page allocator is a per-cpu page allocator. It exists only
+for order-0 pages, reduces contention on the zone->lock and reduces the
+amount of writing on struct page.
+
+When a per-CPU list is empty or pages of the wrong type are allocated,
+the zone->lock will be taken once and the per-CPU list refilled. The event
+triggered is mm_page_alloc_zone_locked for each page allocated with the
+event indicating whether it is for a percpu_refill or not.
+
+When the per-CPU list is too full, a number of pages are freed, each one
+which triggers a mm_page_pcpu_drain event.
+
+The individual nature of the events are so that pages can be tracked
+between allocation and freeing. A number of drain or refill pages that occur
+consecutively imply the zone->lock being taken once. Large amounts of PCP
+refills and drains could imply an imbalance between CPUs where too much work
+is being concentrated in one place. It could also indicate that the per-CPU
+lists should be a larger size. Finally, large amounts of refills on one CPU
+and drains on another could be a factor in causing large amounts of cache
+line bounces due to writes between CPUs and worth investigating if pages
+can be allocated and freed on the same CPU through some algorithm change.
+
+5. External Fragmentation
+=========================
+mm_page_alloc_extfrag page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d
+
+External fragmentation affects whether a high-order allocation will be
+successful or not. For some types of hardware, this is important although
+it is avoided where possible. If the system is using huge pages and needs
+to be able to resize the pool over the lifetime of the system, this value
+is important.
+
+Large numbers of this event implies that memory is fragmenting and
+high-order allocations will start failing at some time in the future. One
+means of reducing the occurange of this event is to increase the size of
+min_free_kbytes in increments of 3*pageblock_size*nr_online_nodes where
+pageblock_size is usually the size of the default hugepage size.
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 2bcc8d4dea29..02ac6ed38b2d 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -1,7 +1,7 @@
Event Tracing
Documentation written by Theodore Ts'o
- Updated by Li Zefan
+ Updated by Li Zefan and Tom Zanussi
1. Introduction
===============
@@ -22,12 +22,12 @@ tracing information should be printed.
---------------------------------
The events which are available for tracing can be found in the file
-/debug/tracing/available_events.
+/sys/kernel/debug/tracing/available_events.
To enable a particular event, such as 'sched_wakeup', simply echo it
-to /debug/tracing/set_event. For example:
+to /sys/kernel/debug/tracing/set_event. For example:
- # echo sched_wakeup >> /debug/tracing/set_event
+ # echo sched_wakeup >> /sys/kernel/debug/tracing/set_event
[ Note: '>>' is necessary, otherwise it will firstly disable
all the events. ]
@@ -35,15 +35,15 @@ to /debug/tracing/set_event. For example:
To disable an event, echo the event name to the set_event file prefixed
with an exclamation point:
- # echo '!sched_wakeup' >> /debug/tracing/set_event
+ # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event
To disable all events, echo an empty line to the set_event file:
- # echo > /debug/tracing/set_event
+ # echo > /sys/kernel/debug/tracing/set_event
To enable all events, echo '*:*' or '*:' to the set_event file:
- # echo *:* > /debug/tracing/set_event
+ # echo *:* > /sys/kernel/debug/tracing/set_event
The events are organized into subsystems, such as ext4, irq, sched,
etc., and a full event name looks like this: <subsystem>:<event>. The
@@ -52,29 +52,29 @@ file. All of the events in a subsystem can be specified via the syntax
"<subsystem>:*"; for example, to enable all irq events, you can use the
command:
- # echo 'irq:*' > /debug/tracing/set_event
+ # echo 'irq:*' > /sys/kernel/debug/tracing/set_event
2.2 Via the 'enable' toggle
---------------------------
-The events available are also listed in /debug/tracing/events/ hierarchy
+The events available are also listed in /sys/kernel/debug/tracing/events/ hierarchy
of directories.
To enable event 'sched_wakeup':
- # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
+ # echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
To disable it:
- # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
+ # echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
To enable all events in sched subsystem:
- # echo 1 > /debug/tracing/events/sched/enable
+ # echo 1 > /sys/kernel/debug/tracing/events/sched/enable
-To eanble all events:
+To enable all events:
- # echo 1 > /debug/tracing/events/enable
+ # echo 1 > /sys/kernel/debug/tracing/events/enable
When reading one of these enable files, there are four results:
@@ -97,3 +97,185 @@ The format of this boot option is the same as described in section 2.1.
See The example provided in samples/trace_events
+4. Event formats
+================
+
+Each trace event has a 'format' file associated with it that contains
+a description of each field in a logged event. This information can
+be used to parse the binary trace stream, and is also the place to
+find the field names that can be used in event filters (see section 5).
+
+It also displays the format string that will be used to print the
+event in text mode, along with the event name and ID used for
+profiling.
+
+Every event has a set of 'common' fields associated with it; these are
+the fields prefixed with 'common_'. The other fields vary between
+events and correspond to the fields defined in the TRACE_EVENT
+definition for that event.
+
+Each field in the format has the form:
+
+ field:field-type field-name; offset:N; size:N;
+
+where offset is the offset of the field in the trace record and size
+is the size of the data item, in bytes.
+
+For example, here's the information displayed for the 'sched_wakeup'
+event:
+
+# cat /debug/tracing/events/sched/sched_wakeup/format
+
+name: sched_wakeup
+ID: 60
+format:
+ field:unsigned short common_type; offset:0; size:2;
+ field:unsigned char common_flags; offset:2; size:1;
+ field:unsigned char common_preempt_count; offset:3; size:1;
+ field:int common_pid; offset:4; size:4;
+ field:int common_tgid; offset:8; size:4;
+
+ field:char comm[TASK_COMM_LEN]; offset:12; size:16;
+ field:pid_t pid; offset:28; size:4;
+ field:int prio; offset:32; size:4;
+ field:int success; offset:36; size:4;
+ field:int cpu; offset:40; size:4;
+
+print fmt: "task %s:%d [%d] success=%d [%03d]", REC->comm, REC->pid,
+ REC->prio, REC->success, REC->cpu
+
+This event contains 10 fields, the first 5 common and the remaining 5
+event-specific. All the fields for this event are numeric, except for
+'comm' which is a string, a distinction important for event filtering.
+
+5. Event filtering
+==================
+
+Trace events can be filtered in the kernel by associating boolean
+'filter expressions' with them. As soon as an event is logged into
+the trace buffer, its fields are checked against the filter expression
+associated with that event type. An event with field values that
+'match' the filter will appear in the trace output, and an event whose
+values don't match will be discarded. An event with no filter
+associated with it matches everything, and is the default when no
+filter has been set for an event.
+
+5.1 Expression syntax
+---------------------
+
+A filter expression consists of one or more 'predicates' that can be
+combined using the logical operators '&&' and '||'. A predicate is
+simply a clause that compares the value of a field contained within a
+logged event with a constant value and returns either 0 or 1 depending
+on whether the field value matched (1) or didn't match (0):
+
+ field-name relational-operator value
+
+Parentheses can be used to provide arbitrary logical groupings and
+double-quotes can be used to prevent the shell from interpreting
+operators as shell metacharacters.
+
+The field-names available for use in filters can be found in the
+'format' files for trace events (see section 4).
+
+The relational-operators depend on the type of the field being tested:
+
+The operators available for numeric fields are:
+
+==, !=, <, <=, >, >=
+
+And for string fields they are:
+
+==, !=
+
+Currently, only exact string matches are supported.
+
+Currently, the maximum number of predicates in a filter is 16.
+
+5.2 Setting filters
+-------------------
+
+A filter for an individual event is set by writing a filter expression
+to the 'filter' file for the given event.
+
+For example:
+
+# cd /debug/tracing/events/sched/sched_wakeup
+# echo "common_preempt_count > 4" > filter
+
+A slightly more involved example:
+
+# cd /debug/tracing/events/sched/sched_signal_send
+# echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
+
+If there is an error in the expression, you'll get an 'Invalid
+argument' error when setting it, and the erroneous string along with
+an error message can be seen by looking at the filter e.g.:
+
+# cd /debug/tracing/events/sched/sched_signal_send
+# echo "((sig >= 10 && sig < 15) || dsig == 17) && comm != bash" > filter
+-bash: echo: write error: Invalid argument
+# cat filter
+((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
+^
+parse_error: Field not found
+
+Currently the caret ('^') for an error always appears at the beginning of
+the filter string; the error message should still be useful though
+even without more accurate position info.
+
+5.3 Clearing filters
+--------------------
+
+To clear the filter for an event, write a '0' to the event's filter
+file.
+
+To clear the filters for all events in a subsystem, write a '0' to the
+subsystem's filter file.
+
+5.3 Subsystem filters
+---------------------
+
+For convenience, filters for every event in a subsystem can be set or
+cleared as a group by writing a filter expression into the filter file
+at the root of the subsytem. Note however, that if a filter for any
+event within the subsystem lacks a field specified in the subsystem
+filter, or if the filter can't be applied for any other reason, the
+filter for that event will retain its previous setting. This can
+result in an unintended mixture of filters which could lead to
+confusing (to the user who might think different filters are in
+effect) trace output. Only filters that reference just the common
+fields can be guaranteed to propagate successfully to all events.
+
+Here are a few subsystem filter examples that also illustrate the
+above points:
+
+Clear the filters on all events in the sched subsytem:
+
+# cd /sys/kernel/debug/tracing/events/sched
+# echo 0 > filter
+# cat sched_switch/filter
+none
+# cat sched_wakeup/filter
+none
+
+Set a filter using only common fields for all events in the sched
+subsytem (all events end up with the same filter):
+
+# cd /sys/kernel/debug/tracing/events/sched
+# echo common_pid == 0 > filter
+# cat sched_switch/filter
+common_pid == 0
+# cat sched_wakeup/filter
+common_pid == 0
+
+Attempt to set a filter using a non-common field for all events in the
+sched subsytem (all events but those that have a prev_pid field retain
+their old filters):
+
+# cd /sys/kernel/debug/tracing/events/sched
+# echo prev_pid == 0 > filter
+# cat sched_switch/filter
+prev_pid == 0
+# cat sched_wakeup/filter
+common_pid == 0
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
new file mode 100644
index 000000000000..7003e10f10f5
--- /dev/null
+++ b/Documentation/trace/ftrace-design.txt
@@ -0,0 +1,233 @@
+ function tracer guts
+ ====================
+
+Introduction
+------------
+
+Here we will cover the architecture pieces that the common function tracing
+code relies on for proper functioning. Things are broken down into increasing
+complexity so that you can start simple and at least get basic functionality.
+
+Note that this focuses on architecture implementation details only. If you
+want more explanation of a feature in terms of common code, review the common
+ftrace.txt file.
+
+
+Prerequisites
+-------------
+
+Ftrace relies on these features being implemented:
+ STACKTRACE_SUPPORT - implement save_stack_trace()
+ TRACE_IRQFLAGS_SUPPORT - implement include/asm/irqflags.h
+
+
+HAVE_FUNCTION_TRACER
+--------------------
+
+You will need to implement the mcount and the ftrace_stub functions.
+
+The exact mcount symbol name will depend on your toolchain. Some call it
+"mcount", "_mcount", or even "__mcount". You can probably figure it out by
+running something like:
+ $ echo 'main(){}' | gcc -x c -S -o - - -pg | grep mcount
+ call mcount
+We'll make the assumption below that the symbol is "mcount" just to keep things
+nice and simple in the examples.
+
+Keep in mind that the ABI that is in effect inside of the mcount function is
+*highly* architecture/toolchain specific. We cannot help you in this regard,
+sorry. Dig up some old documentation and/or find someone more familiar than
+you to bang ideas off of. Typically, register usage (argument/scratch/etc...)
+is a major issue at this point, especially in relation to the location of the
+mcount call (before/after function prologue). You might also want to look at
+how glibc has implemented the mcount function for your architecture. It might
+be (semi-)relevant.
+
+The mcount function should check the function pointer ftrace_trace_function
+to see if it is set to ftrace_stub. If it is, there is nothing for you to do,
+so return immediately. If it isn't, then call that function in the same way
+the mcount function normally calls __mcount_internal -- the first argument is
+the "frompc" while the second argument is the "selfpc" (adjusted to remove the
+size of the mcount call that is embedded in the function).
+
+For example, if the function foo() calls bar(), when the bar() function calls
+mcount(), the arguments mcount() will pass to the tracer are:
+ "frompc" - the address bar() will use to return to foo()
+ "selfpc" - the address bar() (with _mcount() size adjustment)
+
+Also keep in mind that this mcount function will be called *a lot*, so
+optimizing for the default case of no tracer will help the smooth running of
+your system when tracing is disabled. So the start of the mcount function is
+typically the bare min with checking things before returning. That also means
+the code flow should usually kept linear (i.e. no branching in the nop case).
+This is of course an optimization and not a hard requirement.
+
+Here is some pseudo code that should help (these functions should actually be
+implemented in assembly):
+
+void ftrace_stub(void)
+{
+ return;
+}
+
+void mcount(void)
+{
+ /* save any bare state needed in order to do initial checking */
+
+ extern void (*ftrace_trace_function)(unsigned long, unsigned long);
+ if (ftrace_trace_function != ftrace_stub)
+ goto do_trace;
+
+ /* restore any bare state */
+
+ return;
+
+do_trace:
+
+ /* save all state needed by the ABI (see paragraph above) */
+
+ unsigned long frompc = ...;
+ unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
+ ftrace_trace_function(frompc, selfpc);
+
+ /* restore all state needed by the ABI */
+}
+
+Don't forget to export mcount for modules !
+extern void mcount(void);
+EXPORT_SYMBOL(mcount);
+
+
+HAVE_FUNCTION_TRACE_MCOUNT_TEST
+-------------------------------
+
+This is an optional optimization for the normal case when tracing is turned off
+in the system. If you do not enable this Kconfig option, the common ftrace
+code will take care of doing the checking for you.
+
+To support this feature, you only need to check the function_trace_stop
+variable in the mcount function. If it is non-zero, there is no tracing to be
+done at all, so you can return.
+
+This additional pseudo code would simply be:
+void mcount(void)
+{
+ /* save any bare state needed in order to do initial checking */
+
++ if (function_trace_stop)
++ return;
+
+ extern void (*ftrace_trace_function)(unsigned long, unsigned long);
+ if (ftrace_trace_function != ftrace_stub)
+...
+
+
+HAVE_FUNCTION_GRAPH_TRACER
+--------------------------
+
+Deep breath ... time to do some real work. Here you will need to update the
+mcount function to check ftrace graph function pointers, as well as implement
+some functions to save (hijack) and restore the return address.
+
+The mcount function should check the function pointers ftrace_graph_return
+(compare to ftrace_stub) and ftrace_graph_entry (compare to
+ftrace_graph_entry_stub). If either of those are not set to the relevant stub
+function, call the arch-specific function ftrace_graph_caller which in turn
+calls the arch-specific function prepare_ftrace_return. Neither of these
+function names are strictly required, but you should use them anyways to stay
+consistent across the architecture ports -- easier to compare & contrast
+things.
+
+The arguments to prepare_ftrace_return are slightly different than what are
+passed to ftrace_trace_function. The second argument "selfpc" is the same,
+but the first argument should be a pointer to the "frompc". Typically this is
+located on the stack. This allows the function to hijack the return address
+temporarily to have it point to the arch-specific function return_to_handler.
+That function will simply call the common ftrace_return_to_handler function and
+that will return the original return address with which, you can return to the
+original call site.
+
+Here is the updated mcount pseudo code:
+void mcount(void)
+{
+...
+ if (ftrace_trace_function != ftrace_stub)
+ goto do_trace;
+
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++ extern void (*ftrace_graph_return)(...);
++ extern void (*ftrace_graph_entry)(...);
++ if (ftrace_graph_return != ftrace_stub ||
++ ftrace_graph_entry != ftrace_graph_entry_stub)
++ ftrace_graph_caller();
++#endif
+
+ /* restore any bare state */
+...
+
+Here is the pseudo code for the new ftrace_graph_caller assembly function:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void ftrace_graph_caller(void)
+{
+ /* save all state needed by the ABI */
+
+ unsigned long *frompc = &...;
+ unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
+ prepare_ftrace_return(frompc, selfpc);
+
+ /* restore all state needed by the ABI */
+}
+#endif
+
+For information on how to implement prepare_ftrace_return(), simply look at
+the x86 version. The only architecture-specific piece in it is the setup of
+the fault recovery table (the asm(...) code). The rest should be the same
+across architectures.
+
+Here is the pseudo code for the new return_to_handler assembly function. Note
+that the ABI that applies here is different from what applies to the mcount
+code. Since you are returning from a function (after the epilogue), you might
+be able to skimp on things saved/restored (usually just registers used to pass
+return values).
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void return_to_handler(void)
+{
+ /* save all state needed by the ABI (see paragraph above) */
+
+ void (*original_return_point)(void) = ftrace_return_to_handler();
+
+ /* restore all state needed by the ABI */
+
+ /* this is usually either a return or a jump */
+ original_return_point();
+}
+#endif
+
+
+HAVE_FTRACE_NMI_ENTER
+---------------------
+
+If you can't trace NMI functions, then skip this option.
+
+<details to be filled>
+
+
+HAVE_FTRACE_SYSCALLS
+---------------------
+
+<details to be filled>
+
+
+HAVE_FTRACE_MCOUNT_RECORD
+-------------------------
+
+See scripts/recordmcount.pl for more info.
+
+<details to be filled>
+
+
+HAVE_DYNAMIC_FTRACE
+---------------------
+
+<details to be filled>
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 355d0f1f8c50..957b22fde2df 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -26,6 +26,12 @@ disabled, and more (ftrace allows for tracer plugins, which
means that the list of tracers can always grow).
+Implementation Details
+----------------------
+
+See ftrace-design.txt for details for arch porters and such.
+
+
The File System
---------------
@@ -127,7 +133,7 @@ of ftrace. Here is a list of some of the key files:
than requested, the rest of the page will be used,
making the actual allocation bigger than requested.
( Note, the size may not be a multiple of the page size
- due to buffer managment overhead. )
+ due to buffer management overhead. )
This can only be updated when the current_tracer
is set to "nop".
diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl
new file mode 100644
index 000000000000..7df50e8cf4d9
--- /dev/null
+++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl
@@ -0,0 +1,418 @@
+#!/usr/bin/perl
+# This is a POC (proof of concept or piece of crap, take your pick) for reading the
+# text representation of trace output related to page allocation. It makes an attempt
+# to extract some high-level information on what is going on. The accuracy of the parser
+# may vary considerably
+#
+# Example usage: trace-pagealloc-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe
+# other options
+# --prepend-parent Report on the parent proc and PID
+# --read-procstat If the trace lacks process info, get it from /proc
+# --ignore-pid Aggregate processes of the same name together
+#
+# Copyright (c) IBM Corporation 2009
+# Author: Mel Gorman <mel@csn.ul.ie>
+use strict;
+use Getopt::Long;
+
+# Tracepoint events
+use constant MM_PAGE_ALLOC => 1;
+use constant MM_PAGE_FREE_DIRECT => 2;
+use constant MM_PAGEVEC_FREE => 3;
+use constant MM_PAGE_PCPU_DRAIN => 4;
+use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5;
+use constant MM_PAGE_ALLOC_EXTFRAG => 6;
+use constant EVENT_UNKNOWN => 7;
+
+# Constants used to track state
+use constant STATE_PCPU_PAGES_DRAINED => 8;
+use constant STATE_PCPU_PAGES_REFILLED => 9;
+
+# High-level events extrapolated from tracepoints
+use constant HIGH_PCPU_DRAINS => 10;
+use constant HIGH_PCPU_REFILLS => 11;
+use constant HIGH_EXT_FRAGMENT => 12;
+use constant HIGH_EXT_FRAGMENT_SEVERE => 13;
+use constant HIGH_EXT_FRAGMENT_MODERATE => 14;
+use constant HIGH_EXT_FRAGMENT_CHANGED => 15;
+
+my %perprocesspid;
+my %perprocess;
+my $opt_ignorepid;
+my $opt_read_procstat;
+my $opt_prepend_parent;
+
+# Catch sigint and exit on request
+my $sigint_report = 0;
+my $sigint_exit = 0;
+my $sigint_pending = 0;
+my $sigint_received = 0;
+sub sigint_handler {
+ my $current_time = time;
+ if ($current_time - 2 > $sigint_received) {
+ print "SIGINT received, report pending. Hit ctrl-c again to exit\n";
+ $sigint_report = 1;
+ } else {
+ if (!$sigint_exit) {
+ print "Second SIGINT received quickly, exiting\n";
+ }
+ $sigint_exit++;
+ }
+
+ if ($sigint_exit > 3) {
+ print "Many SIGINTs received, exiting now without report\n";
+ exit;
+ }
+
+ $sigint_received = $current_time;
+ $sigint_pending = 1;
+}
+$SIG{INT} = "sigint_handler";
+
+# Parse command line options
+GetOptions(
+ 'ignore-pid' => \$opt_ignorepid,
+ 'read-procstat' => \$opt_read_procstat,
+ 'prepend-parent' => \$opt_prepend_parent,
+);
+
+# Defaults for dynamically discovered regex's
+my $regex_fragdetails_default = 'page=([0-9a-f]*) pfn=([0-9]*) alloc_order=([-0-9]*) fallback_order=([-0-9]*) pageblock_order=([-0-9]*) alloc_migratetype=([-0-9]*) fallback_migratetype=([-0-9]*) fragmenting=([-0-9]) change_ownership=([-0-9])';
+
+# Dyanically discovered regex
+my $regex_fragdetails;
+
+# Static regex used. Specified like this for readability and for use with /o
+# (process_pid) (cpus ) ( time ) (tpoint ) (details)
+my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
+my $regex_statname = '[-0-9]*\s\((.*)\).*';
+my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*';
+
+sub generate_traceevent_regex {
+ my $event = shift;
+ my $default = shift;
+ my $regex;
+
+ # Read the event format or use the default
+ if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) {
+ $regex = $default;
+ } else {
+ my $line;
+ while (!eof(FORMAT)) {
+ $line = <FORMAT>;
+ if ($line =~ /^print fmt:\s"(.*)",.*/) {
+ $regex = $1;
+ $regex =~ s/%p/\([0-9a-f]*\)/g;
+ $regex =~ s/%d/\([-0-9]*\)/g;
+ $regex =~ s/%lu/\([0-9]*\)/g;
+ }
+ }
+ }
+
+ # Verify fields are in the right order
+ my $tuple;
+ foreach $tuple (split /\s/, $regex) {
+ my ($key, $value) = split(/=/, $tuple);
+ my $expected = shift;
+ if ($key ne $expected) {
+ print("WARNING: Format not as expected '$key' != '$expected'");
+ $regex =~ s/$key=\((.*)\)/$key=$1/;
+ }
+ }
+
+ if (defined shift) {
+ die("Fewer fields than expected in format");
+ }
+
+ return $regex;
+}
+$regex_fragdetails = generate_traceevent_regex("kmem/mm_page_alloc_extfrag",
+ $regex_fragdetails_default,
+ "page", "pfn",
+ "alloc_order", "fallback_order", "pageblock_order",
+ "alloc_migratetype", "fallback_migratetype",
+ "fragmenting", "change_ownership");
+
+sub read_statline($) {
+ my $pid = $_[0];
+ my $statline;
+
+ if (open(STAT, "/proc/$pid/stat")) {
+ $statline = <STAT>;
+ close(STAT);
+ }
+
+ if ($statline eq '') {
+ $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0";
+ }
+
+ return $statline;
+}
+
+sub guess_process_pid($$) {
+ my $pid = $_[0];
+ my $statline = $_[1];
+
+ if ($pid == 0) {
+ return "swapper-0";
+ }
+
+ if ($statline !~ /$regex_statname/o) {
+ die("Failed to math stat line for process name :: $statline");
+ }
+ return "$1-$pid";
+}
+
+sub parent_info($$) {
+ my $pid = $_[0];
+ my $statline = $_[1];
+ my $ppid;
+
+ if ($pid == 0) {
+ return "NOPARENT-0";
+ }
+
+ if ($statline !~ /$regex_statppid/o) {
+ die("Failed to match stat line process ppid:: $statline");
+ }
+
+ # Read the ppid stat line
+ $ppid = $1;
+ return guess_process_pid($ppid, read_statline($ppid));
+}
+
+sub process_events {
+ my $traceevent;
+ my $process_pid;
+ my $cpus;
+ my $timestamp;
+ my $tracepoint;
+ my $details;
+ my $statline;
+
+ # Read each line of the event log
+EVENT_PROCESS:
+ while ($traceevent = <STDIN>) {
+ if ($traceevent =~ /$regex_traceevent/o) {
+ $process_pid = $1;
+ $tracepoint = $4;
+
+ if ($opt_read_procstat || $opt_prepend_parent) {
+ $process_pid =~ /(.*)-([0-9]*)$/;
+ my $process = $1;
+ my $pid = $2;
+
+ $statline = read_statline($pid);
+
+ if ($opt_read_procstat && $process eq '') {
+ $process_pid = guess_process_pid($pid, $statline);
+ }
+
+ if ($opt_prepend_parent) {
+ $process_pid = parent_info($pid, $statline) . " :: $process_pid";
+ }
+ }
+
+ # Unnecessary in this script. Uncomment if required
+ # $cpus = $2;
+ # $timestamp = $3;
+ } else {
+ next;
+ }
+
+ # Perl Switch() sucks majorly
+ if ($tracepoint eq "mm_page_alloc") {
+ $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++;
+ } elsif ($tracepoint eq "mm_page_free_direct") {
+ $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++;
+ } elsif ($tracepoint eq "mm_pagevec_free") {
+ $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++;
+ } elsif ($tracepoint eq "mm_page_pcpu_drain") {
+ $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++;
+ } elsif ($tracepoint eq "mm_page_alloc_zone_locked") {
+ $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED}++;
+ } elsif ($tracepoint eq "mm_page_alloc_extfrag") {
+
+ # Extract the details of the event now
+ $details = $5;
+
+ my ($page, $pfn);
+ my ($alloc_order, $fallback_order, $pageblock_order);
+ my ($alloc_migratetype, $fallback_migratetype);
+ my ($fragmenting, $change_ownership);
+
+ if ($details !~ /$regex_fragdetails/o) {
+ print "WARNING: Failed to parse mm_page_alloc_extfrag as expected\n";
+ next;
+ }
+
+ $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}++;
+ $page = $1;
+ $pfn = $2;
+ $alloc_order = $3;
+ $fallback_order = $4;
+ $pageblock_order = $5;
+ $alloc_migratetype = $6;
+ $fallback_migratetype = $7;
+ $fragmenting = $8;
+ $change_ownership = $9;
+
+ if ($fragmenting) {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}++;
+ if ($fallback_order <= 3) {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}++;
+ } else {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}++;
+ }
+ }
+ if ($change_ownership) {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}++;
+ }
+ } else {
+ $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++;
+ }
+
+ # Catch a full pcpu drain event
+ if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} &&
+ $tracepoint ne "mm_page_pcpu_drain") {
+
+ $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0;
+ }
+
+ # Catch a full pcpu refill event
+ if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} &&
+ $tracepoint ne "mm_page_alloc_zone_locked") {
+ $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0;
+ }
+
+ if ($sigint_pending) {
+ last EVENT_PROCESS;
+ }
+ }
+}
+
+sub dump_stats {
+ my $hashref = shift;
+ my %stats = %$hashref;
+
+ # Dump per-process stats
+ my $process_pid;
+ my $max_strlen = 0;
+
+ # Get the maximum process name
+ foreach $process_pid (keys %perprocesspid) {
+ my $len = length($process_pid);
+ if ($len > $max_strlen) {
+ $max_strlen = $len;
+ }
+ }
+ $max_strlen += 2;
+
+ printf("\n");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
+ "Process", "Pages", "Pages", "Pages", "Pages", "PCPU", "PCPU", "PCPU", "Fragment", "Fragment", "MigType", "Fragment", "Fragment", "Unknown");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
+ "details", "allocd", "allocd", "freed", "freed", "pages", "drains", "refills", "Fallback", "Causing", "Changed", "Severe", "Moderate", "");
+
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
+ "", "", "under lock", "direct", "pagevec", "drain", "", "", "", "", "", "", "", "");
+
+ foreach $process_pid (keys %stats) {
+ # Dump final aggregates
+ if ($stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED}) {
+ $stats{$process_pid}->{HIGH_PCPU_DRAINS}++;
+ $stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0;
+ }
+ if ($stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED}) {
+ $stats{$process_pid}->{HIGH_PCPU_REFILLS}++;
+ $stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0;
+ }
+
+ printf("%-" . $max_strlen . "s %8d %10d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n",
+ $process_pid,
+ $stats{$process_pid}->{MM_PAGE_ALLOC},
+ $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED},
+ $stats{$process_pid}->{MM_PAGE_FREE_DIRECT},
+ $stats{$process_pid}->{MM_PAGEVEC_FREE},
+ $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN},
+ $stats{$process_pid}->{HIGH_PCPU_DRAINS},
+ $stats{$process_pid}->{HIGH_PCPU_REFILLS},
+ $stats{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG},
+ $stats{$process_pid}->{HIGH_EXT_FRAG},
+ $stats{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED},
+ $stats{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE},
+ $stats{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE},
+ $stats{$process_pid}->{EVENT_UNKNOWN});
+ }
+}
+
+sub aggregate_perprocesspid() {
+ my $process_pid;
+ my $process;
+ undef %perprocess;
+
+ foreach $process_pid (keys %perprocesspid) {
+ $process = $process_pid;
+ $process =~ s/-([0-9])*$//;
+ if ($process eq '') {
+ $process = "NO_PROCESS_NAME";
+ }
+
+ $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC};
+ $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED};
+ $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT};
+ $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE};
+ $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN};
+ $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS};
+ $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS};
+ $perprocess{$process}->{MM_PAGE_ALLOC_EXTFRAG} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG};
+ $perprocess{$process}->{HIGH_EXT_FRAG} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAG};
+ $perprocess{$process}->{HIGH_EXT_FRAGMENT_CHANGED} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED};
+ $perprocess{$process}->{HIGH_EXT_FRAGMENT_SEVERE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE};
+ $perprocess{$process}->{HIGH_EXT_FRAGMENT_MODERATE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE};
+ $perprocess{$process}->{EVENT_UNKNOWN} += $perprocesspid{$process_pid}->{EVENT_UNKNOWN};
+ }
+}
+
+sub report() {
+ if (!$opt_ignorepid) {
+ dump_stats(\%perprocesspid);
+ } else {
+ aggregate_perprocesspid();
+ dump_stats(\%perprocess);
+ }
+}
+
+# Process events or signals until neither is available
+sub signal_loop() {
+ my $sigint_processed;
+ do {
+ $sigint_processed = 0;
+ process_events();
+
+ # Handle pending signals if any
+ if ($sigint_pending) {
+ my $current_time = time;
+
+ if ($sigint_exit) {
+ print "Received exit signal\n";
+ $sigint_pending = 0;
+ }
+ if ($sigint_report) {
+ if ($current_time >= $sigint_received + 2) {
+ report();
+ $sigint_report = 0;
+ $sigint_pending = 0;
+ $sigint_processed = 1;
+ }
+ }
+ }
+ } while ($sigint_pending || $sigint_processed);
+}
+
+signal_loop();
+report();
diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt
deleted file mode 100644
index cd805e16dc27..000000000000
--- a/Documentation/trace/power.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-The power tracer collects detailed information about C-state and P-state
-transitions, instead of just looking at the high-level "average"
-information.
-
-There is a helper script found in scrips/tracing/power.pl in the kernel
-sources which can be used to parse this information and create a
-Scalable Vector Graphics (SVG) picture from the trace data.
-
-To use this tracer:
-
- echo 0 > /sys/kernel/debug/tracing/tracing_enabled
- echo power > /sys/kernel/debug/tracing/current_tracer
- echo 1 > /sys/kernel/debug/tracing/tracing_enabled
- sleep 1
- echo 0 > /sys/kernel/debug/tracing/tracing_enabled
- cat /sys/kernel/debug/tracing/trace | \
- perl scripts/tracing/power.pl > out.sv
diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt
new file mode 100644
index 000000000000..5eb4e487e667
--- /dev/null
+++ b/Documentation/trace/tracepoint-analysis.txt
@@ -0,0 +1,327 @@
+ Notes on Analysing Behaviour Using Events and Tracepoints
+
+ Documentation written by Mel Gorman
+ PCL information heavily based on email from Ingo Molnar
+
+1. Introduction
+===============
+
+Tracepoints (see Documentation/trace/tracepoints.txt) can be used without
+creating custom kernel modules to register probe functions using the event
+tracing infrastructure.
+
+Simplistically, tracepoints will represent an important event that when can
+be taken in conjunction with other tracepoints to build a "Big Picture" of
+what is going on within the system. There are a large number of methods for
+gathering and interpreting these events. Lacking any current Best Practises,
+this document describes some of the methods that can be used.
+
+This document assumes that debugfs is mounted on /sys/kernel/debug and that
+the appropriate tracing options have been configured into the kernel. It is
+assumed that the PCL tool tools/perf has been installed and is in your path.
+
+2. Listing Available Events
+===========================
+
+2.1 Standard Utilities
+----------------------
+
+All possible events are visible from /sys/kernel/debug/tracing/events. Simply
+calling
+
+ $ find /sys/kernel/debug/tracing/events -type d
+
+will give a fair indication of the number of events available.
+
+2.2 PCL
+-------
+
+Discovery and enumeration of all counters and events, including tracepoints
+are available with the perf tool. Getting a list of available events is a
+simple case of
+
+ $ perf list 2>&1 | grep Tracepoint
+ ext4:ext4_free_inode [Tracepoint event]
+ ext4:ext4_request_inode [Tracepoint event]
+ ext4:ext4_allocate_inode [Tracepoint event]
+ ext4:ext4_write_begin [Tracepoint event]
+ ext4:ext4_ordered_write_end [Tracepoint event]
+ [ .... remaining output snipped .... ]
+
+
+2. Enabling Events
+==================
+
+2.1 System-Wide Event Enabling
+------------------------------
+
+See Documentation/trace/events.txt for a proper description on how events
+can be enabled system-wide. A short example of enabling all events related
+to page allocation would look something like
+
+ $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done
+
+2.2 System-Wide Event Enabling with SystemTap
+---------------------------------------------
+
+In SystemTap, tracepoints are accessible using the kernel.trace() function
+call. The following is an example that reports every 5 seconds what processes
+were allocating the pages.
+
+ global page_allocs
+
+ probe kernel.trace("mm_page_alloc") {
+ page_allocs[execname()]++
+ }
+
+ function print_count() {
+ printf ("%-25s %-s\n", "#Pages Allocated", "Process Name")
+ foreach (proc in page_allocs-)
+ printf("%-25d %s\n", page_allocs[proc], proc)
+ printf ("\n")
+ delete page_allocs
+ }
+
+ probe timer.s(5) {
+ print_count()
+ }
+
+2.3 System-Wide Event Enabling with PCL
+---------------------------------------
+
+By specifying the -a switch and analysing sleep, the system-wide events
+for a duration of time can be examined.
+
+ $ perf stat -a \
+ -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ sleep 10
+ Performance counter stats for 'sleep 10':
+
+ 9630 kmem:mm_page_alloc
+ 2143 kmem:mm_page_free_direct
+ 7424 kmem:mm_pagevec_free
+
+ 10.002577764 seconds time elapsed
+
+Similarly, one could execute a shell and exit it as desired to get a report
+at that point.
+
+2.4 Local Event Enabling
+------------------------
+
+Documentation/trace/ftrace.txt describes how to enable events on a per-thread
+basis using set_ftrace_pid.
+
+2.5 Local Event Enablement with PCL
+-----------------------------------
+
+Events can be activate and tracked for the duration of a process on a local
+basis using PCL such as follows.
+
+ $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free ./hackbench 10
+ Time: 0.909
+
+ Performance counter stats for './hackbench 10':
+
+ 17803 kmem:mm_page_alloc
+ 12398 kmem:mm_page_free_direct
+ 4827 kmem:mm_pagevec_free
+
+ 0.973913387 seconds time elapsed
+
+3. Event Filtering
+==================
+
+Documentation/trace/ftrace.txt covers in-depth how to filter events in
+ftrace. Obviously using grep and awk of trace_pipe is an option as well
+as any script reading trace_pipe.
+
+4. Analysing Event Variances with PCL
+=====================================
+
+Any workload can exhibit variances between runs and it can be important
+to know what the standard deviation in. By and large, this is left to the
+performance analyst to do it by hand. In the event that the discrete event
+occurrences are useful to the performance analyst, then perf can be used.
+
+ $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct
+ -e kmem:mm_pagevec_free ./hackbench 10
+ Time: 0.890
+ Time: 0.895
+ Time: 0.915
+ Time: 1.001
+ Time: 0.899
+
+ Performance counter stats for './hackbench 10' (5 runs):
+
+ 16630 kmem:mm_page_alloc ( +- 3.542% )
+ 11486 kmem:mm_page_free_direct ( +- 4.771% )
+ 4730 kmem:mm_pagevec_free ( +- 2.325% )
+
+ 0.982653002 seconds time elapsed ( +- 1.448% )
+
+In the event that some higher-level event is required that depends on some
+aggregation of discrete events, then a script would need to be developed.
+
+Using --repeat, it is also possible to view how events are fluctuating over
+time on a system wide basis using -a and sleep.
+
+ $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ -a --repeat 10 \
+ sleep 1
+ Performance counter stats for 'sleep 1' (10 runs):
+
+ 1066 kmem:mm_page_alloc ( +- 26.148% )
+ 182 kmem:mm_page_free_direct ( +- 5.464% )
+ 890 kmem:mm_pagevec_free ( +- 30.079% )
+
+ 1.002251757 seconds time elapsed ( +- 0.005% )
+
+5. Higher-Level Analysis with Helper Scripts
+============================================
+
+When events are enabled the events that are triggering can be read from
+/sys/kernel/debug/tracing/trace_pipe in human-readable format although binary
+options exist as well. By post-processing the output, further information can
+be gathered on-line as appropriate. Examples of post-processing might include
+
+ o Reading information from /proc for the PID that triggered the event
+ o Deriving a higher-level event from a series of lower-level events.
+ o Calculate latencies between two events
+
+Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example
+script that can read trace_pipe from STDIN or a copy of a trace. When used
+on-line, it can be interrupted once to generate a report without existing
+and twice to exit.
+
+Simplistically, the script just reads STDIN and counts up events but it
+also can do more such as
+
+ o Derive high-level events from many low-level events. If a number of pages
+ are freed to the main allocator from the per-CPU lists, it recognises
+ that as one per-CPU drain even though there is no specific tracepoint
+ for that event
+ o It can aggregate based on PID or individual process number
+ o In the event memory is getting externally fragmented, it reports
+ on whether the fragmentation event was severe or moderate.
+ o When receiving an event about a PID, it can record who the parent was so
+ that if large numbers of events are coming from very short-lived
+ processes, the parent process responsible for creating all the helpers
+ can be identified
+
+6. Lower-Level Analysis with PCL
+================================
+
+There may also be a requirement to identify what functions with a program
+were generating events within the kernel. To begin this sort of analysis, the
+data must be recorded. At the time of writing, this required root
+
+ $ perf record -c 1 \
+ -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ ./hackbench 10
+ Time: 0.894
+ [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ]
+
+Note the use of '-c 1' to set the event period to sample. The default sample
+period is quite high to minimise overhead but the information collected can be
+very coarse as a result.
+
+This record outputted a file called perf.data which can be analysed using
+perf report.
+
+ $ perf report
+ # Samples: 30922
+ #
+ # Overhead Command Shared Object
+ # ........ ......... ................................
+ #
+ 87.27% hackbench [vdso]
+ 6.85% hackbench /lib/i686/cmov/libc-2.9.so
+ 2.62% hackbench /lib/ld-2.9.so
+ 1.52% perf [vdso]
+ 1.22% hackbench ./hackbench
+ 0.48% hackbench [kernel]
+ 0.02% perf /lib/i686/cmov/libc-2.9.so
+ 0.01% perf /usr/bin/perf
+ 0.01% perf /lib/ld-2.9.so
+ 0.00% hackbench /lib/i686/cmov/libpthread-2.9.so
+ #
+ # (For more details, try: perf report --sort comm,dso,symbol)
+ #
+
+According to this, the vast majority of events occured triggered on events
+within the VDSO. With simple binaries, this will often be the case so lets
+take a slightly different example. In the course of writing this, it was
+noticed that X was generating an insane amount of page allocations so lets look
+at it
+
+ $ perf record -c 1 -f \
+ -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ -p `pidof X`
+
+This was interrupted after a few seconds and
+
+ $ perf report
+ # Samples: 27666
+ #
+ # Overhead Command Shared Object
+ # ........ ....... .......................................
+ #
+ 51.95% Xorg [vdso]
+ 47.95% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1
+ 0.09% Xorg /lib/i686/cmov/libc-2.9.so
+ 0.01% Xorg [kernel]
+ #
+ # (For more details, try: perf report --sort comm,dso,symbol)
+ #
+
+So, almost half of the events are occuring in a library. To get an idea which
+symbol.
+
+ $ perf report --sort comm,dso,symbol
+ # Samples: 27666
+ #
+ # Overhead Command Shared Object Symbol
+ # ........ ....... ....................................... ......
+ #
+ 51.95% Xorg [vdso] [.] 0x000000ffffe424
+ 47.93% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixmanFillsse2
+ 0.09% Xorg /lib/i686/cmov/libc-2.9.so [.] _int_malloc
+ 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixman_region32_copy_f
+ 0.01% Xorg [kernel] [k] read_hpet
+ 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path
+ 0.00% Xorg [kernel] [k] ftrace_trace_userstack
+
+To see where within the function pixmanFillsse2 things are going wrong
+
+ $ perf annotate pixmanFillsse2
+ [ ... ]
+ 0.00 : 34eeb: 0f 18 08 prefetcht0 (%eax)
+ : }
+ :
+ : extern __inline void __attribute__((__gnu_inline__, __always_inline__, _
+ : _mm_store_si128 (__m128i *__P, __m128i __B) : {
+ : *__P = __B;
+ 12.40 : 34eee: 66 0f 7f 80 40 ff ff movdqa %xmm0,-0xc0(%eax)
+ 0.00 : 34ef5: ff
+ 12.40 : 34ef6: 66 0f 7f 80 50 ff ff movdqa %xmm0,-0xb0(%eax)
+ 0.00 : 34efd: ff
+ 12.39 : 34efe: 66 0f 7f 80 60 ff ff movdqa %xmm0,-0xa0(%eax)
+ 0.00 : 34f05: ff
+ 12.67 : 34f06: 66 0f 7f 80 70 ff ff movdqa %xmm0,-0x90(%eax)
+ 0.00 : 34f0d: ff
+ 12.58 : 34f0e: 66 0f 7f 40 80 movdqa %xmm0,-0x80(%eax)
+ 12.31 : 34f13: 66 0f 7f 40 90 movdqa %xmm0,-0x70(%eax)
+ 12.40 : 34f18: 66 0f 7f 40 a0 movdqa %xmm0,-0x60(%eax)
+ 12.31 : 34f1d: 66 0f 7f 40 b0 movdqa %xmm0,-0x50(%eax)
+
+At a glance, it looks like the time is being spent copying pixmaps to
+the card. Further investigation would be needed to determine why pixmaps
+are being copied around so much but a starting point would be to take an
+ancient build of libpixmap out of the library path where it was totally
+forgotten about from months ago!