From 2b022e3d4bf9885f781221c59d86283a2cdfc2ed Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 10 Aug 2009 10:48:59 +0800 Subject: timers: Add tracepoints for timer_list timers Add tracepoints which cover the timer life cycle. The tracepoints are integrated with the already existing debug_object debug points as far as possible. Based on patches from Mathieu: http://marc.info/?l=linux-kernel&m=123791201816247&w=2 and Anton: http://marc.info/?l=linux-kernel&m=124331396919301&w=2 [ tglx: Fixed timeout value in timer_start tracepoint, massaged comments and made the printk's more readable ] Signed-off-by: Xiao Guangrong Cc: Anton Blanchard Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: KOSAKI Motohiro Cc: Zhaolei LKML-Reference: <4A7F8A9B.3040201@cn.fujitsu.com> Signed-off-by: Thomas Gleixner --- include/trace/events/timer.h | 137 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 include/trace/events/timer.h (limited to 'include/trace/events') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h new file mode 100644 index 000000000000..725892a93b49 --- /dev/null +++ b/include/trace/events/timer.h @@ -0,0 +1,137 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM timer + +#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TIMER_H + +#include +#include + +/** + * timer_init - called when the timer is initialized + * @timer: pointer to struct timer_list + */ +TRACE_EVENT(timer_init, + + TP_PROTO(struct timer_list *timer), + + TP_ARGS(timer), + + TP_STRUCT__entry( + __field( void *, timer ) + ), + + TP_fast_assign( + __entry->timer = timer; + ), + + TP_printk("timer %p", __entry->timer) +); + +/** + * timer_start - called when the timer is started + * @timer: pointer to struct timer_list + * @expires: the timers expiry time + */ +TRACE_EVENT(timer_start, + + TP_PROTO(struct timer_list *timer, unsigned long expires), + + TP_ARGS(timer, expires), + + TP_STRUCT__entry( + __field( void *, timer ) + __field( void *, function ) + __field( unsigned long, expires ) + __field( unsigned long, now ) + ), + + TP_fast_assign( + __entry->timer = timer; + __entry->function = timer->function; + __entry->expires = expires; + __entry->now = jiffies; + ), + + TP_printk("timer %p: func %pf, expires %lu, timeout %ld", + __entry->timer, __entry->function, __entry->expires, + (long)__entry->expires - __entry->now) +); + +/** + * timer_expire_entry - called immediately before the timer callback + * @timer: pointer to struct timer_list + * + * Allows to determine the timer latency. + */ +TRACE_EVENT(timer_expire_entry, + + TP_PROTO(struct timer_list *timer), + + TP_ARGS(timer), + + TP_STRUCT__entry( + __field( void *, timer ) + __field( unsigned long, now ) + ), + + TP_fast_assign( + __entry->timer = timer; + __entry->now = jiffies; + ), + + TP_printk("timer %p: now %lu", __entry->timer, __entry->now) +); + +/** + * timer_expire_exit - called immediately after the timer callback returns + * @timer: pointer to struct timer_list + * + * When used in combination with the timer_expire_entry tracepoint we can + * determine the runtime of the timer callback function. + * + * NOTE: Do NOT derefernce timer in TP_fast_assign. The pointer might + * be invalid. We solely track the pointer. + */ +TRACE_EVENT(timer_expire_exit, + + TP_PROTO(struct timer_list *timer), + + TP_ARGS(timer), + + TP_STRUCT__entry( + __field(void *, timer ) + ), + + TP_fast_assign( + __entry->timer = timer; + ), + + TP_printk("timer %p", __entry->timer) +); + +/** + * timer_cancel - called when the timer is canceled + * @timer: pointer to struct timer_list + */ +TRACE_EVENT(timer_cancel, + + TP_PROTO(struct timer_list *timer), + + TP_ARGS(timer), + + TP_STRUCT__entry( + __field( void *, timer ) + ), + + TP_fast_assign( + __entry->timer = timer; + ), + + TP_printk("timer %p", __entry->timer) +); + +#endif /* _TRACE_TIMER_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From c6a2a1770245f654f35f60e1458d4356680f9519 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 10 Aug 2009 10:51:23 +0800 Subject: hrtimer: Add tracepoint for hrtimers Add tracepoints which cover the life cycle of a hrtimer. The tracepoints are integrated with the already existing debug_object debug points as far as possible. [ tglx: Fixed comments, made output conistent, easier to read and parse. Fixed output for 32bit archs which do not use the scalar representation of ktime_t. Hand current time to trace_hrtimer_expiry_entry instead of calling get_time() inside of the trace assignment. ] Signed-off-by: Xiao Guangrong Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Anton Blanchard Cc: Peter Zijlstra Cc: KOSAKI Motohiro Cc: Zhaolei LKML-Reference: <4A7F8B2B.5020908@cn.fujitsu.com> Signed-off-by: Thomas Gleixner --- include/trace/events/timer.h | 139 +++++++++++++++++++++++++++++++++++++++++++ kernel/hrtimer.c | 40 ++++++++++--- 2 files changed, 171 insertions(+), 8 deletions(-) (limited to 'include/trace/events') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 725892a93b49..df3c07fa0cb8 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -5,6 +5,7 @@ #define _TRACE_TIMER_H #include +#include #include /** @@ -131,6 +132,144 @@ TRACE_EVENT(timer_cancel, TP_printk("timer %p", __entry->timer) ); +/** + * hrtimer_init - called when the hrtimer is initialized + * @timer: pointer to struct hrtimer + * @clockid: the hrtimers clock + * @mode: the hrtimers mode + */ +TRACE_EVENT(hrtimer_init, + + TP_PROTO(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode), + + TP_ARGS(timer, clockid, mode), + + TP_STRUCT__entry( + __field( void *, timer ) + __field( clockid_t, clockid ) + __field( enum hrtimer_mode, mode ) + ), + + TP_fast_assign( + __entry->timer = timer; + __entry->clockid = clockid; + __entry->mode = mode; + ), + + TP_printk("hrtimer %p, clockid %s, mode %s", __entry->timer, + __entry->clockid == CLOCK_REALTIME ? + "CLOCK_REALTIME" : "CLOCK_MONOTONIC", + __entry->mode == HRTIMER_MODE_ABS ? + "HRTIMER_MODE_ABS" : "HRTIMER_MODE_REL") +); + +/** + * hrtimer_start - called when the hrtimer is started + * @timer: pointer to struct hrtimer + */ +TRACE_EVENT(hrtimer_start, + + TP_PROTO(struct hrtimer *timer), + + TP_ARGS(timer), + + TP_STRUCT__entry( + __field( void *, timer ) + __field( void *, function ) + __field( s64, expires ) + __field( s64, softexpires ) + ), + + TP_fast_assign( + __entry->timer = timer; + __entry->function = timer->function; + __entry->expires = hrtimer_get_expires(timer).tv64; + __entry->softexpires = hrtimer_get_softexpires(timer).tv64; + ), + + TP_printk("hrtimer %p, func %pf, expires %llu, softexpires %llu", + __entry->timer, __entry->function, + (unsigned long long)ktime_to_ns((ktime_t) { + .tv64 = __entry->expires }), + (unsigned long long)ktime_to_ns((ktime_t) { + .tv64 = __entry->softexpires })) +); + +/** + * htimmer_expire_entry - called immediately before the hrtimer callback + * @timer: pointer to struct hrtimer + * @now: pointer to variable which contains current time of the + * timers base. + * + * Allows to determine the timer latency. + */ +TRACE_EVENT(hrtimer_expire_entry, + + TP_PROTO(struct hrtimer *timer, ktime_t *now), + + TP_ARGS(timer, now), + + TP_STRUCT__entry( + __field( void *, timer ) + __field( s64, now ) + ), + + TP_fast_assign( + __entry->timer = timer; + __entry->now = now->tv64; + ), + + TP_printk("hrtimer %p, now %llu", __entry->timer, + (unsigned long long)ktime_to_ns((ktime_t) { + .tv64 = __entry->now })) + ); + +/** + * hrtimer_expire_exit - called immediately after the hrtimer callback returns + * @timer: pointer to struct hrtimer + * + * When used in combination with the hrtimer_expire_entry tracepoint we can + * determine the runtime of the callback function. + */ +TRACE_EVENT(hrtimer_expire_exit, + + TP_PROTO(struct hrtimer *timer), + + TP_ARGS(timer), + + TP_STRUCT__entry( + __field( void *, timer ) + ), + + TP_fast_assign( + __entry->timer = timer; + ), + + TP_printk("hrtimer %p", __entry->timer) +); + +/** + * hrtimer_cancel - called when the hrtimer is canceled + * @timer: pointer to struct hrtimer + */ +TRACE_EVENT(hrtimer_cancel, + + TP_PROTO(struct hrtimer *timer), + + TP_ARGS(timer), + + TP_STRUCT__entry( + __field( void *, timer ) + ), + + TP_fast_assign( + __entry->timer = timer; + ), + + TP_printk("hrtimer %p", __entry->timer) +); + #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e2f91ecc01a8..b44d1b07377b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,6 +48,8 @@ #include +#include + /* * The timer bases: * @@ -441,6 +443,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -797,7 +819,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer *entry; int leftmost = 1; - debug_hrtimer_activate(timer); + debug_activate(timer); /* * Find the right place in the rbtree: @@ -883,7 +905,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, @@ -1116,7 +1138,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - debug_hrtimer_init(timer); + debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -1140,7 +1162,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void __run_hrtimer(struct hrtimer *timer) +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; struct hrtimer_cpu_base *cpu_base = base->cpu_base; @@ -1149,7 +1171,7 @@ static void __run_hrtimer(struct hrtimer *timer) WARN_ON(!irqs_disabled()); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1160,7 +1182,9 @@ static void __run_hrtimer(struct hrtimer *timer) * the timer base. */ spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); restart = fn(timer); + trace_hrtimer_expire_exit(timer); spin_lock(&cpu_base->lock); /* @@ -1271,7 +1295,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - __run_hrtimer(timer); + __run_hrtimer(timer, &basenow); } base++; } @@ -1393,7 +1417,7 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer); + __run_hrtimer(timer, &base->softirq_time); } spin_unlock(&cpu_base->lock); } @@ -1569,7 +1593,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the -- cgit v1.2.3 From 3f0a525ebf4b8ef041a332bbe4a73aee94bb064b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 10 Aug 2009 10:52:30 +0800 Subject: itimers: Add tracepoints for itimer Add tracepoints for all itimer variants: ITIMER_REAL, ITIMER_VIRTUAL and ITIMER_PROF. [ tglx: Fixed comments and made the output more readable, parseable and consistent. Replaced pid_vnr by pid_nr because the hrtimer callback can happen in any namespace ] Signed-off-by: Xiao Guangrong Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Anton Blanchard Cc: Peter Zijlstra Cc: KOSAKI Motohiro Cc: Zhaolei LKML-Reference: <4A7F8B6E.2010109@cn.fujitsu.com> Signed-off-by: Thomas Gleixner --- include/trace/events/timer.h | 66 ++++++++++++++++++++++++++++++++++++++++++++ kernel/itimer.c | 5 ++++ kernel/posix-cpu-timers.c | 7 ++++- 3 files changed, 77 insertions(+), 1 deletion(-) (limited to 'include/trace/events') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index df3c07fa0cb8..1844c48d640e 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -270,6 +270,72 @@ TRACE_EVENT(hrtimer_cancel, TP_printk("hrtimer %p", __entry->timer) ); +/** + * itimer_state - called when itimer is started or canceled + * @which: name of the interval timer + * @value: the itimers value, itimer is canceled if value->it_value is + * zero, otherwise it is started + * @expires: the itimers expiry time + */ +TRACE_EVENT(itimer_state, + + TP_PROTO(int which, const struct itimerval *const value, + cputime_t expires), + + TP_ARGS(which, value, expires), + + TP_STRUCT__entry( + __field( int, which ) + __field( cputime_t, expires ) + __field( long, value_sec ) + __field( long, value_usec ) + __field( long, interval_sec ) + __field( long, interval_usec ) + ), + + TP_fast_assign( + __entry->which = which; + __entry->expires = expires; + __entry->value_sec = value->it_value.tv_sec; + __entry->value_usec = value->it_value.tv_usec; + __entry->interval_sec = value->it_interval.tv_sec; + __entry->interval_usec = value->it_interval.tv_usec; + ), + + TP_printk("which %d, expires %lu, it_value %lu.%lu, it_interval %lu.%lu", + __entry->which, __entry->expires, + __entry->value_sec, __entry->value_usec, + __entry->interval_sec, __entry->interval_usec) +); + +/** + * itimer_expire - called when itimer expires + * @which: type of the interval timer + * @pid: pid of the process which owns the timer + * @now: current time, used to calculate the latency of itimer + */ +TRACE_EVENT(itimer_expire, + + TP_PROTO(int which, struct pid *pid, cputime_t now), + + TP_ARGS(which, pid, now), + + TP_STRUCT__entry( + __field( int , which ) + __field( pid_t, pid ) + __field( cputime_t, now ) + ), + + TP_fast_assign( + __entry->which = which; + __entry->now = now; + __entry->pid = pid_nr(pid); + ), + + TP_printk("which %d, pid %d, now %lu", __entry->which, + (int) __entry->pid, __entry->now) +); + #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ diff --git a/kernel/itimer.c b/kernel/itimer.c index 8078a32d3b10..b03451ede528 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -122,6 +123,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; @@ -166,6 +168,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, } it->expires = nval; it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); spin_unlock_irq(&tsk->sighand->siglock); @@ -217,6 +221,7 @@ again: } else tsk->signal->it_real_incr.tv64 = 0; + trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 12161f74744e..5c9dc228747b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,6 +8,7 @@ #include #include #include +#include /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. @@ -1090,9 +1091,13 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, cputime_one_jiffy); it->error -= onecputick; } - } else + } else { it->expires = cputime_zero; + } + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } -- cgit v1.2.3 From 98a56ab382079f777e261e14512cbd4fb2107af4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2009 08:48:28 -0400 Subject: ext4: Fix spelling typo in the trace format for trace_ext4_da_writepages() Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/trace/events') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 8d433c4e3709..15051d2d1219 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -243,7 +243,7 @@ TRACE_EVENT(ext4_da_writepages, __entry->range_cyclic = wbc->range_cyclic; ), - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, -- cgit v1.2.3 From b3a3ca8ca0c3c29abc5b2bfe94bb14f3f4590df9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 31 Aug 2009 23:13:11 -0400 Subject: ext4: Add new tracepoint: trace_ext4_da_write_pages() Add a new tracepoint which shows the pages that will be written using write_cache_pages() by ext4_da_writepages(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 15 +++++++++++++++ fs/ext4/inode.c | 13 +------------ include/trace/events/ext4.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 12 deletions(-) (limited to 'include/trace/events') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 41a76e163b99..81014f4ed22d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -113,6 +113,21 @@ struct ext4_allocation_request { unsigned int flags; }; +/* + * For delayed allocation tracking + */ +struct mpage_da_data { + struct inode *inode; + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ + unsigned long first_page, next_page; /* extent of pages */ + struct writeback_control *wbc; + int io_done; + int pages_written; + int retval; +}; + /* * Special inodes numbers */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff659e757578..17802a96af9f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1875,18 +1875,6 @@ static void ext4_da_page_release_reservation(struct page *page, * Delayed allocation stuff */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - /* * mpage_da_submit_io - walks through extent of pages and try to write * them with writepage() call back @@ -2863,6 +2851,7 @@ retry: mpd.io_done = 1; ret = MPAGE_DA_EXTENT_TAIL; } + trace_ext4_da_write_pages(inode, &mpd); wbc->nr_to_write -= mpd.pages_written; ext4_journal_stop(handle); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 15051d2d1219..dd43399288ea 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -251,6 +251,40 @@ TRACE_EVENT(ext4_da_writepages, __entry->range_cyclic) ); +TRACE_EVENT(ext4_da_write_pages, + TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), + + TP_ARGS(inode, mpd), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, b_blocknr ) + __field( __u32, b_size ) + __field( __u32, b_state ) + __field( unsigned long, first_page ) + __field( int, io_done ) + __field( int, pages_written ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->b_blocknr = mpd->b_blocknr; + __entry->b_size = mpd->b_size; + __entry->b_state = mpd->b_state; + __entry->first_page = mpd->first_page; + __entry->io_done = mpd->io_done; + __entry->pages_written = mpd->pages_written; + ), + + TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, + __entry->b_blocknr, __entry->b_size, + __entry->b_state, __entry->first_page, + __entry->io_done, __entry->pages_written) +); + TRACE_EVENT(ext4_da_writepages_result, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int ret, int pages_written), -- cgit v1.2.3 From a3710fd1ee8cd542c5de63cf2c39f8912031f867 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2009 08:50:18 -0400 Subject: ext4: fix tracepoint format string warnings Unlike on some other architectures ino_t is an unsigned int on s390. So add an explicit cast to avoid lots of compile warnings: In file included from include/trace/ftrace.h:285, from include/trace/define_trace.h:61, from include/trace/events/ext4.h:711, from fs/ext4/super.c:50: include/trace/events/ext4.h: In function 'ftrace_raw_output_ext4_free_inode': include/trace/events/ext4.h:12: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' Signed-off-by: Heiko Carstens Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 75 ++++++++++++++++++++++++--------------------- include/trace/events/jbd2.h | 2 +- 2 files changed, 41 insertions(+), 36 deletions(-) (limited to 'include/trace/events') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index dd43399288ea..68b53c7ef8a7 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -33,8 +33,8 @@ TRACE_EVENT(ext4_free_inode, ), TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, - __entry->uid, __entry->gid, + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, __entry->uid, __entry->gid, (unsigned long long) __entry->blocks) ); @@ -56,7 +56,8 @@ TRACE_EVENT(ext4_request_inode, ), TP_printk("dev %s dir %lu mode %d", - jbd2_dev_to_name(__entry->dev), __entry->dir, __entry->mode) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, + __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, @@ -79,7 +80,8 @@ TRACE_EVENT(ext4_allocate_inode, ), TP_printk("dev %s ino %lu dir %lu mode %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->dir, __entry->mode) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_write_begin, @@ -106,8 +108,8 @@ TRACE_EVENT(ext4_write_begin, ), TP_printk("dev %s ino %lu pos %llu len %u flags %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->flags) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->flags) ); TRACE_EVENT(ext4_ordered_write_end, @@ -133,8 +135,8 @@ TRACE_EVENT(ext4_ordered_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_writeback_write_end, @@ -160,8 +162,8 @@ TRACE_EVENT(ext4_writeback_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_journalled_write_end, @@ -186,8 +188,8 @@ TRACE_EVENT(ext4_journalled_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_writepage, @@ -209,7 +211,8 @@ TRACE_EVENT(ext4_writepage, ), TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->index) ); TRACE_EVENT(ext4_da_writepages, @@ -244,7 +247,8 @@ TRACE_EVENT(ext4_da_writepages, ), TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, + jbd2_dev_to_name(__entry->dev), + (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, __entry->for_kupdate, __entry->for_reclaim, @@ -279,7 +283,7 @@ TRACE_EVENT(ext4_da_write_pages, ), TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->b_blocknr, __entry->b_size, __entry->b_state, __entry->first_page, __entry->io_done, __entry->pages_written) @@ -314,7 +318,8 @@ TRACE_EVENT(ext4_da_writepages_result, ), TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, + jbd2_dev_to_name(__entry->dev), + (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->encountered_congestion, __entry->more_io, __entry->no_nrwrite_index_update) @@ -343,8 +348,8 @@ TRACE_EVENT(ext4_da_write_begin, ), TP_printk("dev %s ino %lu pos %llu len %u flags %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->flags) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->flags) ); TRACE_EVENT(ext4_da_write_end, @@ -370,8 +375,8 @@ TRACE_EVENT(ext4_da_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_discard_blocks, @@ -421,8 +426,8 @@ TRACE_EVENT(ext4_mb_new_inode_pa, ), TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, - __entry->pa_len, __entry->pa_lstart) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); TRACE_EVENT(ext4_mb_new_group_pa, @@ -449,8 +454,8 @@ TRACE_EVENT(ext4_mb_new_group_pa, ), TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, - __entry->pa_len, __entry->pa_lstart) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); TRACE_EVENT(ext4_mb_release_inode_pa, @@ -476,8 +481,8 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ), TP_printk("dev %s ino %lu block %llu count %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, - __entry->count) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, @@ -522,7 +527,7 @@ TRACE_EVENT(ext4_discard_preallocations, ), TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mb_discard_preallocations, @@ -577,8 +582,8 @@ TRACE_EVENT(ext4_request_blocks, ), TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, - __entry->len, + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->flags, __entry->len, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, (unsigned long long) __entry->lleft, @@ -621,8 +626,8 @@ TRACE_EVENT(ext4_allocate_blocks, ), TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, - __entry->len, __entry->block, + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->flags, __entry->len, __entry->block, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, (unsigned long long) __entry->lleft, @@ -655,8 +660,8 @@ TRACE_EVENT(ext4_free_blocks, ), TP_printk("dev %s ino %lu block %llu count %lu metadata %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, - __entry->count, __entry->metadata) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->block, __entry->count, __entry->metadata) ); TRACE_EVENT(ext4_sync_file, @@ -679,8 +684,8 @@ TRACE_EVENT(ext4_sync_file, ), TP_printk("dev %s ino %ld parent %ld datasync %d ", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->parent, - __entry->datasync) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_fs, diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 10813fa0c8d0..b851f0b4701c 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -159,7 +159,7 @@ TRACE_EVENT(jbd2_submit_inode_data, ), TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) ); #endif /* _TRACE_JBD2_H */ -- cgit v1.2.3 From f977bb4937857994312fff4f9c2cad336a36a932 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Sep 2009 18:15:54 +0200 Subject: perf_counter, sched: Add sched_stat_runtime tracepoint This allows more precise tracking of how the scheduler accounts (and acts upon) a task having spent N nanoseconds of CPU time. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 33 +++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 1 + 2 files changed, 34 insertions(+) (limited to 'include/trace/events') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index b48f1ad7c946..4069c43f4187 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -379,6 +379,39 @@ TRACE_EVENT(sched_stat_wait, (unsigned long long)__entry->delay) ); +/* + * Tracepoint for accounting runtime (time the task is executing + * on a CPU). + */ +TRACE_EVENT(sched_stat_runtime, + + TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), + + TP_ARGS(tsk, runtime, vruntime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, runtime ) + __field( u64, vruntime ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->runtime = runtime; + __entry->vruntime = vruntime; + ) + TP_perf_assign( + __perf_count(runtime); + ), + + TP_printk("task: %s:%d runtime: %Lu [ns], vruntime: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->runtime, + (unsigned long long)__entry->vruntime) +); + /* * Tracepoint for accounting sleep time (time the task is not runnable, * including iowait, see below). diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f84121016..a097e909e80f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } -- cgit v1.2.3 From 3661d28615ea580c1db02a972fd4d3898df1cb01 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 14 Sep 2009 22:59:50 -0400 Subject: ext4: Fix include/trace/events/ext4.h to work with Systemtap Using relative pathnames in #include statements interacts badly with SystemTap, since the fs/ext4/*.h header files are not packaged up as part of a distribution kernel's header files. Since systemtap doesn't use TP_fast_assign(), we can use a blind structure definition and then make sure the needed header files are defined before the ext4 source files #include the trace/events/ext4.h header file. https://bugzilla.redhat.com/show_bug.cgi?id=512478 Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 + include/trace/events/ext4.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/trace/events') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04c693357336..af95dd8ba54b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,6 +45,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "mballoc.h" #define CREATE_TRACE_POINTS #include diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 68b53c7ef8a7..6fe6ce9ee071 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -5,10 +5,12 @@ #define _TRACE_EXT4_H #include -#include "../../../fs/ext4/ext4.h" -#include "../../../fs/ext4/mballoc.h" #include +struct ext4_allocation_context; +struct ext4_allocation_request; +struct ext4_prealloc_space; + TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), -- cgit v1.2.3 From fb40ba0d98968bc3454731360363d725b4f1064c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 16 Sep 2009 19:30:40 -0400 Subject: ext4: Add a tracepoint for ext4_alloc_da_blocks() Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 ++ include/trace/events/ext4.h | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/trace/events') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a5b4ce40cc66..9887a0c562d5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3128,6 +3128,8 @@ out: */ int ext4_alloc_da_blocks(struct inode *inode) { + trace_ext4_alloc_da_blocks(inode); + if (!EXT4_I(inode)->i_reserved_data_blocks && !EXT4_I(inode)->i_reserved_meta_blocks) return 0; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6fe6ce9ee071..c1bd8f1e8b94 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -10,6 +10,9 @@ struct ext4_allocation_context; struct ext4_allocation_request; struct ext4_prealloc_space; +struct ext4_inode_info; + +#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), @@ -710,6 +713,30 @@ TRACE_EVENT(ext4_sync_fs, __entry->wait) ); +TRACE_EVENT(ext4_alloc_da_blocks, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned int, data_blocks ) + __field( unsigned int, meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + ), + + TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->data_blocks, __entry->meta_blocks) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 6161352142d5fed4cd753b32e5ccde66e705b14e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 17 Sep 2009 16:11:28 +0200 Subject: tracing, perf: Convert the power tracer into an event tracer This patch converts the existing power tracer into an event tracer, so that power events (C states and frequency changes) can be tracked via "perf". This also removes the perl script that was used to demo the tracer; its functionality is being replaced entirely with timechart. Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: <20090912130542.6d314860@infradead.org> Signed-off-by: Ingo Molnar --- Documentation/trace/power.txt | 17 --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 7 +- arch/x86/kernel/process.c | 28 ++-- include/trace/events/power.h | 81 +++++++++++ kernel/trace/Makefile | 2 +- kernel/trace/power-traces.c | 20 +++ kernel/trace/trace.h | 3 - kernel/trace/trace_entries.h | 17 --- kernel/trace/trace_power.c | 218 ----------------------------- scripts/tracing/power.pl | 108 -------------- 10 files changed, 113 insertions(+), 388 deletions(-) delete mode 100644 Documentation/trace/power.txt create mode 100644 include/trace/events/power.h create mode 100644 kernel/trace/power-traces.c delete mode 100644 kernel/trace/trace_power.c delete mode 100644 scripts/tracing/power.pl (limited to 'include/trace/events') diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt deleted file mode 100644 index cd805e16dc27..000000000000 --- a/Documentation/trace/power.txt +++ /dev/null @@ -1,17 +0,0 @@ -The power tracer collects detailed information about C-state and P-state -transitions, instead of just looking at the high-level "average" -information. - -There is a helper script found in scrips/tracing/power.pl in the kernel -sources which can be used to parse this information and create a -Scalable Vector Graphics (SVG) picture from the trace data. - -To use this tracer: - - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - echo power > /sys/kernel/debug/tracing/current_tracer - echo 1 > /sys/kernel/debug/tracing/tracing_enabled - sleep 1 - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - cat /sys/kernel/debug/tracing/trace | \ - perl scripts/tracing/power.pl > out.sv diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4109679863c1..479cc8c418c1 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include @@ -72,8 +72,6 @@ static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); static DEFINE_PER_CPU(struct aperfmperf, old_perf); -DEFINE_TRACE(power_mark); - /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -332,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; - struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -364,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_mark(&it, POWER_PSTATE, next_perf_state); + trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 071166a4ba83..7b60e3906889 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -299,9 +296,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -314,7 +309,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); + trace_power_end(0); } else { local_irq_enable(); /* loop is done by the caller */ @@ -372,9 +367,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -384,15 +377,14 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); + trace_power_end(0); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -402,7 +394,7 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); + trace_power_end(0); } else local_irq_enable(); } @@ -414,13 +406,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } /* diff --git a/include/trace/events/power.h b/include/trace/events/power.h new file mode 100644 index 000000000000..ea6d579261ad --- /dev/null +++ b/include/trace/events/power.h @@ -0,0 +1,81 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM power + +#if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_POWER_H + +#include +#include + +#ifndef _TRACE_POWER_ENUM_ +#define _TRACE_POWER_ENUM_ +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; +#endif + + + +TRACE_EVENT(power_start, + + TP_PROTO(unsigned int type, unsigned int state), + + TP_ARGS(type, state), + + TP_STRUCT__entry( + __field( u64, type ) + __field( u64, state ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->state = state; + ), + + TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long)__entry->state) +); + +TRACE_EVENT(power_end, + + TP_PROTO(int dummy), + + TP_ARGS(dummy), + + TP_STRUCT__entry( + __field( u64, dummy ) + ), + + TP_fast_assign( + __entry->dummy = 0xffff; + ), + + TP_printk("dummy=%lu", (unsigned long)__entry->dummy) + +); + + +TRACE_EVENT(power_frequency, + + TP_PROTO(unsigned int type, unsigned int state), + + TP_ARGS(type, state), + + TP_STRUCT__entry( + __field( u64, type ) + __field( u64, state ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->state = state; + ), + + TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long) __entry->state) +); + +#endif /* _TRACE_POWER_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164dca90a..26f03ac07c2b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o @@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 000000000000..e06c6e3d56a3 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven + */ + +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); + diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86bcff94791a..405cb850b75d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -37,7 +36,6 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, - TRACE_POWER, TRACE_BLK, __TRACE_LAST_TYPE, @@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index a431748ddd6e..ead3d724599d 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry, F_printk("from: %llx to: %llx", __entry->from, __entry->to) ); -FTRACE_ENTRY(power, trace_power, - - TRACE_POWER, - - F_STRUCT( - __field_struct( struct power_trace, state_data ) - __field_desc( s64, state_data, stamp ) - __field_desc( s64, state_data, end ) - __field_desc( int, state_data, type ) - __field_desc( int, state_data, state ) - ), - - F_printk("%llx->%llx type:%u state:%u", - __entry->stamp, __entry->end, - __entry->type, __entry->state) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c deleted file mode 100644 index fe1a00f1445a..000000000000 --- a/kernel/trace/trace_power.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * ring buffer based C-state tracer - * - * Arjan van de Ven - * Copyright (C) 2008 Intel Corporation - * - * Much is borrowed from trace_boot.c which is - * Copyright (C) 2008 Frederic Weisbecker - * - */ - -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *power_trace; -static int __read_mostly trace_power_enabled; - -static void probe_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} - - -static void probe_power_end(struct power_trace *it) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static void probe_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static int tracing_power_register(void) -{ - int ret; - - ret = register_trace_power_start(probe_power_start); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_start\n"); - return ret; - } - ret = register_trace_power_end(probe_power_end); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_end\n"); - goto fail_start; - } - ret = register_trace_power_mark(probe_power_mark); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_mark\n"); - goto fail_end; - } - return ret; -fail_end: - unregister_trace_power_end(probe_power_end); -fail_start: - unregister_trace_power_start(probe_power_start); - return ret; -} - -static void start_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 1; -} - -static void stop_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 0; -} - -static void power_trace_reset(struct trace_array *tr) -{ - trace_power_enabled = 0; - unregister_trace_power_start(probe_power_start); - unregister_trace_power_end(probe_power_end); - unregister_trace_power_mark(probe_power_mark); -} - - -static int power_trace_init(struct trace_array *tr) -{ - power_trace = tr; - - trace_power_enabled = 1; - tracing_power_register(); - - tracing_reset_online_cpus(tr); - return 0; -} - -static enum print_line_t power_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_power *field ; - struct power_trace *it; - struct trace_seq *s = &iter->seq; - struct timespec stamp; - struct timespec duration; - - trace_assign_type(field, entry); - it = &field->state_data; - stamp = ktime_to_timespec(it->stamp); - duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); - - if (entry->type == TRACE_POWER) { - if (it->type == POWER_CSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu, - duration.tv_sec, - duration.tv_nsec); - if (it->type == POWER_PSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static void power_print_header(struct seq_file *s) -{ - seq_puts(s, "# TIMESTAMP STATE EVENT\n"); - seq_puts(s, "# | | |\n"); -} - -static struct tracer power_tracer __read_mostly = -{ - .name = "power", - .init = power_trace_init, - .start = start_power_trace, - .stop = stop_power_trace, - .reset = power_trace_reset, - .print_line = power_print_line, - .print_header = power_print_header, -}; - -static int init_power_trace(void) -{ - return register_tracer(&power_tracer); -} -device_initcall(init_power_trace); diff --git a/scripts/tracing/power.pl b/scripts/tracing/power.pl deleted file mode 100644 index 4f729b3501e0..000000000000 --- a/scripts/tracing/power.pl +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/perl - -# Copyright 2008, Intel Corporation -# -# This file is part of the Linux kernel -# -# This program file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation; version 2 of the License. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -# for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program in a file named COPYING; if not, write to the -# Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, -# Boston, MA 02110-1301 USA -# -# Authors: -# Arjan van de Ven - - -# -# This script turns a cstate ftrace output into a SVG graphic that shows -# historic C-state information -# -# -# cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg -# - -my @styles; -my $base = 0; - -my @pstate_last; -my @pstate_level; - -$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; - - -print " \n"; -print "\n"; - -my $scale = 30000.0; -while (<>) { - my $line = $_; - if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) { - if ($base == 0) { - $base = $1; - } - my $time = $1 - $base; - $time = $time * $scale; - my $C = $2; - my $cpu = $3; - my $y = 400 * $cpu; - my $duration = $4 * $scale; - my $msec = int($4 * 100000)/100.0; - my $height = $C * 20; - $style = $styles[$C]; - - $y = $y + 140 - $height; - - $x2 = $time + 4; - $y2 = $y + 4; - - - print "\n"; - print "C$C $msec\n"; - } - if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) { - my $time = $1 - $base; - my $state = $2; - my $cpu = $3; - - if (defined($pstate_last[$cpu])) { - my $from = $pstate_last[$cpu]; - my $oldstate = $pstate_state[$cpu]; - my $duration = ($time-$from) * $scale; - - $from = $from * $scale; - my $to = $from + $duration; - my $height = 140 - ($oldstate * (140/8)); - - my $y = 400 * $cpu + 200 + $height; - my $y2 = $y+4; - my $style = $styles[8]; - - print "\n"; - print "P$oldstate (cpu $cpu)\n"; - }; - - $pstate_last[$cpu] = $time; - $pstate_state[$cpu] = $state; - } -} - - -print "\n"; -- cgit v1.2.3 From 4b4f278c030aa4b6ee0915f396e9a9478d92d610 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:02:41 -0700 Subject: tracing, page-allocator: add trace events for page allocation and page freeing This patch adds trace events for the allocation and freeing of pages, including the freeing of pagevecs. Using the events, it will be known what struct page and pfns are being allocated and freed and what the call site was in many cases. The page alloc tracepoints be used as an indicator as to whether the workload was heavily dependant on the page allocator or not. You can make a guess based on vmstat but you can't get a per-process breakdown. Depending on the call path, the call_site for page allocation may be __get_free_pages() instead of a useful callsite. Instead of passing down a return address similar to slab debugging, the user should enable the stacktrace and seg-addr options to get a proper stack trace. The pagevec free tracepoint has a different usecase. It can be used to get a idea of how many pages are being dumped off the LRU and whether it is kswapd doing the work or a process doing direct reclaim. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Ingo Molnar Cc: Larry Woodman Cc: Peter Zijlstra Cc: Li Ming Chun Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 74 +++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 7 ++++- 2 files changed, 80 insertions(+), 1 deletion(-) (limited to 'include/trace/events') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 1493c541f9c4..0d358a0d45c1 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -225,6 +225,80 @@ TRACE_EVENT(kmem_cache_free, TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) ); + +TRACE_EVENT(mm_page_free_direct, + + TP_PROTO(struct page *page, unsigned int order), + + TP_ARGS(page, order), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( unsigned int, order ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + ), + + TP_printk("page=%p pfn=%lu order=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->order) +); + +TRACE_EVENT(mm_pagevec_free, + + TP_PROTO(struct page *page, int cold), + + TP_ARGS(page, cold), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( int, cold ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->cold = cold; + ), + + TP_printk("page=%p pfn=%lu order=0 cold=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->cold) +); + +TRACE_EVENT(mm_page_alloc, + + TP_PROTO(struct page *page, unsigned int order, + gfp_t gfp_flags, int migratetype), + + TP_ARGS(page, order, gfp_flags, migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( unsigned int, order ) + __field( gfp_t, gfp_flags ) + __field( int, migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + __entry->gfp_flags = gfp_flags; + __entry->migratetype = migratetype; + ), + + TP_printk("page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s", + __entry->page, + page_to_pfn(__entry->page), + __entry->order, + __entry->migratetype, + show_gfp_flags(__entry->gfp_flags)) +); + #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 913a8ebd3a8e..80f954d82d77 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1076,6 +1076,7 @@ static void free_hot_cold_page(struct page *page, int cold) void free_hot_page(struct page *page) { + trace_mm_page_free_direct(page, 0); free_hot_cold_page(page, 0); } @@ -1920,6 +1921,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -1954,13 +1956,16 @@ void __pagevec_free(struct pagevec *pvec) { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { + trace_mm_pagevec_free(pvec->pages[i], pvec->cold); free_hot_cold_page(pvec->pages[i], pvec->cold); + } } void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { + trace_mm_page_free_direct(page, order); if (order == 0) free_hot_page(page); else -- cgit v1.2.3 From e0fff1bd12469c45dab088e353d8882761387bb6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:02:42 -0700 Subject: tracing, page-allocator: add trace events for anti-fragmentation falling back to other migratetypes Fragmentation avoidance depends on being able to use free pages from lists of the appropriate migrate type. In the event this is not possible, __rmqueue_fallback() selects a different list and in some circumstances change the migratetype of the pageblock. Simplistically, the more times this event occurs, the more likely that fragmentation will be a problem later for hugepage allocation at least but there are other considerations such as the order of page being split to satisfy the allocation. This patch adds a trace event for __rmqueue_fallback() that reports what page is being used for the fallback, the orders of relevant pages, the desired migratetype and the migratetype of the lists being used, whether the pageblock changed type and whether this event is important with respect to fragmentation avoidance or not. This information can be used to help analyse fragmentation avoidance and help decide whether min_free_kbytes should be increased or not. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Ingo Molnar Cc: Larry Woodman Cc: Peter Zijlstra Cc: Li Ming Chun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 38 ++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 4 ++++ 2 files changed, 42 insertions(+) (limited to 'include/trace/events') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 0d358a0d45c1..aae16ee17601 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -299,6 +299,44 @@ TRACE_EVENT(mm_page_alloc, show_gfp_flags(__entry->gfp_flags)) ); +TRACE_EVENT(mm_page_alloc_extfrag, + + TP_PROTO(struct page *page, + int alloc_order, int fallback_order, + int alloc_migratetype, int fallback_migratetype), + + TP_ARGS(page, + alloc_order, fallback_order, + alloc_migratetype, fallback_migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( int, alloc_order ) + __field( int, fallback_order ) + __field( int, alloc_migratetype ) + __field( int, fallback_migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->alloc_order = alloc_order; + __entry->fallback_order = fallback_order; + __entry->alloc_migratetype = alloc_migratetype; + __entry->fallback_migratetype = fallback_migratetype; + ), + + TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->alloc_order, + __entry->fallback_order, + pageblock_order, + __entry->alloc_migratetype, + __entry->fallback_migratetype, + __entry->fallback_order < pageblock_order, + __entry->alloc_migratetype == __entry->fallback_migratetype) +); + #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 80f954d82d77..77f517c18b37 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -853,6 +853,10 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) start_migratetype); expand(zone, page, order, current_order, area, migratetype); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype); + return page; } } -- cgit v1.2.3 From 0d3d062a6e289e065bd0aa537a6806a1806bf8aa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:02:44 -0700 Subject: tracing, page-allocator: add trace event for page traffic related to the buddy lists The page allocation trace event reports that a page was successfully allocated but it does not specify where it came from. When analysing performance, it can be important to distinguish between pages coming from the per-cpu allocator and pages coming from the buddy lists as the latter requires the zone lock to the taken and more data structures to be examined. This patch adds a trace event for __rmqueue reporting when a page is being allocated from the buddy lists. It distinguishes between being called to refill the per-cpu lists or whether it is a high-order allocation. Similarly, this patch adds an event to catch when the PCP lists are being drained a little and pages are going back to the buddy lists. This is trickier to draw conclusions from but high activity on those events could explain why there were a large number of cache misses on a page-allocator-intensive workload. The coalescing and splitting of buddies involves a lot of writing of page metadata and cache line bounces not to mention the acquisition of an interrupt-safe lock necessary to enter this path. [akpm@linux-foundation.org: fix build] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Ingo Molnar Cc: Larry Woodman Cc: Peter Zijlstra Cc: Li Ming Chun Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 51 +++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 3 +++ 2 files changed, 54 insertions(+) (limited to 'include/trace/events') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index aae16ee17601..eaf46bdd18a5 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -299,6 +299,57 @@ TRACE_EVENT(mm_page_alloc, show_gfp_flags(__entry->gfp_flags)) ); +TRACE_EVENT(mm_page_alloc_zone_locked, + + TP_PROTO(struct page *page, unsigned int order, int migratetype), + + TP_ARGS(page, order, migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( unsigned int, order ) + __field( int, migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + __entry->migratetype = migratetype; + ), + + TP_printk("page=%p pfn=%lu order=%u migratetype=%d percpu_refill=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->order, + __entry->migratetype, + __entry->order == 0) +); + +TRACE_EVENT(mm_page_pcpu_drain, + + TP_PROTO(struct page *page, int order, int migratetype), + + TP_ARGS(page, order, migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( int, order ) + __field( int, migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + __entry->migratetype = migratetype; + ), + + TP_printk("page=%p pfn=%lu order=%d migratetype=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->order, + __entry->migratetype) +); + TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77f517c18b37..4c847cc57caf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -535,6 +536,7 @@ static void free_pages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); + trace_mm_page_pcpu_drain(page, order, page_private(page)); __free_one_page(page, zone, order, page_private(page)); } spin_unlock(&zone->lock); @@ -890,6 +892,7 @@ retry_reserve: } } + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } -- cgit v1.2.3