summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/async.c141
-rw-r--r--kernel/audit.c3
-rw-r--r--kernel/audit.h26
-rw-r--r--kernel/audit_tree.c237
-rw-r--r--kernel/audit_watch.c274
-rw-r--r--kernel/auditfilter.c39
-rw-r--r--kernel/auditsc.c10
-rw-r--r--kernel/cgroup.c39
-rw-r--r--kernel/cpu.c139
-rw-r--r--kernel/cpuset.c101
-rw-r--r--kernel/cred.c85
-rw-r--r--kernel/debug/Makefile6
-rw-r--r--kernel/debug/debug_core.c985
-rw-r--r--kernel/debug/debug_core.h81
-rw-r--r--kernel/debug/gdbstub.c1095
-rw-r--r--kernel/debug/kdb/.gitignore1
-rw-r--r--kernel/debug/kdb/Makefile25
-rw-r--r--kernel/debug/kdb/kdb_bp.c564
-rw-r--r--kernel/debug/kdb/kdb_bt.c210
-rw-r--r--kernel/debug/kdb/kdb_cmds35
-rw-r--r--kernel/debug/kdb/kdb_debugger.c169
-rw-r--r--kernel/debug/kdb/kdb_io.c826
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c212
-rw-r--r--kernel/debug/kdb/kdb_main.c2956
-rw-r--r--kernel/debug/kdb/kdb_private.h298
-rw-r--r--kernel/debug/kdb/kdb_support.c927
-rw-r--r--kernel/early_res.c6
-rw-r--r--kernel/exec_domain.c38
-rw-r--r--kernel/exit.c42
-rw-r--r--kernel/fork.c54
-rw-r--r--kernel/futex.c17
-rw-r--r--kernel/hrtimer.c19
-rw-r--r--kernel/hw_breakpoint.c90
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/kallsyms.c21
-rw-r--r--kernel/kexec.c7
-rw-r--r--kernel/kgdb.c1764
-rw-r--r--kernel/kmod.c193
-rw-r--r--kernel/ksysfs.c3
-rw-r--r--kernel/kthread.c164
-rw-r--r--kernel/lockdep.c7
-rw-r--r--kernel/module.c1360
-rw-r--r--kernel/mutex.c7
-rw-r--r--kernel/padata.c882
-rw-r--r--kernel/panic.c27
-rw-r--r--kernel/perf_event.c1057
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/pm_qos_params.c215
-rw-r--r--kernel/posix-cpu-timers.c48
-rw-r--r--kernel/posix-timers.c22
-rw-r--r--kernel/power/Kconfig9
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/hibernate.c27
-rw-r--r--kernel/power/main.c55
-rw-r--r--kernel/power/nvs.c (renamed from kernel/power/hibernate_nvs.c)24
-rw-r--r--kernel/power/process.c21
-rw-r--r--kernel/power/snapshot.c3
-rw-r--r--kernel/power/suspend.c19
-rw-r--r--kernel/power/swap.c12
-rw-r--r--kernel/printk.c68
-rw-r--r--kernel/profile.c8
-rw-r--r--kernel/ptrace.c26
-rw-r--r--kernel/range.c4
-rw-r--r--kernel/rcupdate.c160
-rw-r--r--kernel/rcutiny.c2
-rw-r--r--kernel/rcutorture.c3
-rw-r--r--kernel/rcutree.c2
-rw-r--r--kernel/relay.c17
-rw-r--r--kernel/resource.c16
-rw-r--r--kernel/sched.c590
-rw-r--r--kernel/sched_clock.c96
-rw-r--r--kernel/sched_cpupri.c8
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c12
-rw-r--r--kernel/sched_fair.c556
-rw-r--r--kernel/sched_rt.c3
-rw-r--r--kernel/sched_stats.h27
-rw-r--r--kernel/signal.c72
-rw-r--r--kernel/slow-work-debugfs.c227
-rw-r--r--kernel/slow-work.c1068
-rw-r--r--kernel/slow-work.h72
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/softlockup.c293
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c152
-rw-r--r--kernel/sysctl_binary.c9
-rw-r--r--kernel/time.c24
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/clocksource.c33
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-sched.c32
-rw-r--r--kernel/time/timekeeping.c79
-rw-r--r--kernel/timer.c75
-rw-r--r--kernel/trace/Kconfig83
-rw-r--r--kernel/trace/Makefile7
-rw-r--r--kernel/trace/blktrace.c140
-rw-r--r--kernel/trace/ftrace.c12
-rw-r--r--kernel/trace/kmemtrace.c511
-rw-r--r--kernel/trace/ring_buffer.c59
-rw-r--r--kernel/trace/trace.c253
-rw-r--r--kernel/trace/trace.h116
-rw-r--r--kernel/trace/trace_boot.c185
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_clock.c7
-rw-r--r--kernel/trace/trace_entries.h94
-rw-r--r--kernel/trace/trace_event_perf.c181
-rw-r--r--kernel/trace/trace_events.c338
-rw-r--r--kernel/trace/trace_events_filter.c45
-rw-r--r--kernel/trace/trace_export.c20
-rw-r--r--kernel/trace/trace_functions.c6
-rw-r--r--kernel/trace/trace_functions_graph.c16
-rw-r--r--kernel/trace/trace_irqsoff.c3
-rw-r--r--kernel/trace/trace_kdb.c136
-rw-r--r--kernel/trace/trace_kprobe.c488
-rw-r--r--kernel/trace/trace_ksym.c508
-rw-r--r--kernel/trace/trace_output.c180
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_sched_switch.c20
-rw-r--r--kernel/trace/trace_sched_wakeup.c35
-rw-r--r--kernel/trace/trace_selftest.c87
-rw-r--r--kernel/trace/trace_stack.c6
-rw-r--r--kernel/trace/trace_syscalls.c147
-rw-r--r--kernel/trace/trace_sysprof.c329
-rw-r--r--kernel/trace/trace_workqueue.c26
-rw-r--r--kernel/tracepoint.c91
-rw-r--r--kernel/user_namespace.c48
-rw-r--r--kernel/watchdog.c567
-rw-r--r--kernel/workqueue.c3176
-rw-r--r--kernel/workqueue_sched.h9
134 files changed, 17927 insertions, 9498 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 149e18ef1ab1..0b72d1a74be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
-obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(file->f_path.dentry, &sbuf))
+ if (vfs_statfs(&file->f_path, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
-#include <linux/bug.h>
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include <asm/atomic.h>
static async_cookie_t next_cookie = 1;
-#define MAX_THREADS 256
#define MAX_WORK 32768
static LIST_HEAD(async_pending);
static LIST_HEAD(async_running);
static DEFINE_SPINLOCK(async_lock);
-static int async_enabled = 0;
-
struct async_entry {
- struct list_head list;
- async_cookie_t cookie;
- async_func_ptr *func;
- void *data;
- struct list_head *running;
+ struct list_head list;
+ struct work_struct work;
+ async_cookie_t cookie;
+ async_func_ptr *func;
+ void *data;
+ struct list_head *running;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
-static DECLARE_WAIT_QUEUE_HEAD(async_new);
static atomic_t entry_count;
-static atomic_t thread_count;
extern int initcall_debug;
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
+
/*
* pick the first pending entry and run it
*/
-static void run_one_entry(void)
+static void async_run_entry_fn(struct work_struct *work)
{
+ struct async_entry *entry =
+ container_of(work, struct async_entry, work);
unsigned long flags;
- struct async_entry *entry;
ktime_t calltime, delta, rettime;
- /* 1) pick one task from the pending queue */
-
+ /* 1) move self to the running queue */
spin_lock_irqsave(&async_lock, flags);
- if (list_empty(&async_pending))
- goto out;
- entry = list_first_entry(&async_pending, struct async_entry, list);
-
- /* 2) move it to the running queue */
list_move_tail(&entry->list, entry->running);
spin_unlock_irqrestore(&async_lock, flags);
- /* 3) run it (and print duration)*/
+ /* 2) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
(long long)ktime_to_ns(delta) >> 10);
}
- /* 4) remove it from the running queue */
+ /* 3) remove self from the running queue */
spin_lock_irqsave(&async_lock, flags);
list_del(&entry->list);
- /* 5) free the entry */
+ /* 4) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 6) wake up any waiters. */
+ /* 5) wake up any waiters */
wake_up(&async_done);
- return;
-
-out:
- spin_unlock_irqrestore(&async_lock, flags);
}
-
static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
-
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
- if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
ptr(data, newcookie);
return newcookie;
}
+ INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = ptr;
entry->data = data;
entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
list_add_tail(&entry->list, &async_pending);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- wake_up(&async_new);
+
+ /* schedule for execution */
+ queue_work(system_unbound_wq, &entry->work);
+
return newcookie;
}
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
async_synchronize_cookie_domain(cookie, &async_running);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
-
-
-static int async_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int ret = HZ;
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * check the list head without lock.. false positives
- * are dealt with inside run_one_entry() while holding
- * the lock.
- */
- rmb();
- if (!list_empty(&async_pending))
- run_one_entry();
- else
- ret = schedule_timeout(HZ);
-
- if (ret == 0) {
- /*
- * we timed out, this means we as thread are redundant.
- * we sign off and die, but we to avoid any races there
- * is a last-straw check to see if work snuck in.
- */
- atomic_dec(&thread_count);
- wmb(); /* manager must see our departure first */
- if (list_empty(&async_pending))
- break;
- /*
- * woops work came in between us timing out and us
- * signing off; we need to stay alive and keep working.
- */
- atomic_inc(&thread_count);
- }
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int async_manager_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int tc, ec;
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- tc = atomic_read(&thread_count);
- rmb();
- ec = atomic_read(&entry_count);
-
- while (tc < ec && tc < MAX_THREADS) {
- if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
- tc))) {
- msleep(100);
- continue;
- }
- atomic_inc(&thread_count);
- tc++;
- }
-
- schedule();
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int __init async_init(void)
-{
- async_enabled =
- !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
-
- WARN_ON(!async_enabled);
- return 0;
-}
-
-core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
-#include <linux/inotify.h>
#include <linux/freezer.h>
#include <linux/tty.h>
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
audit_hold_skb(skb);
} else
/* drop the extra reference if sent ok */
- kfree_skb(skb);
+ consume_skb(skb);
}
static int kauditd_thread(void *dummy)
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+
/* audit watch functions */
-extern unsigned long audit_watch_inode(struct audit_watch *watch);
-extern dev_t audit_watch_dev(struct audit_watch *watch);
+#ifdef CONFIG_AUDIT_WATCH
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
-extern int audit_add_watch(struct audit_krule *krule);
-extern void audit_remove_watch(struct audit_watch *watch);
-extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
-extern void audit_inotify_unregister(struct list_head *in_list);
+extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
+extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern struct list_head *audit_watch_rules(struct audit_watch *watch);
-
-extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+#else
+#define audit_put_watch(w) {}
+#define audit_get_watch(w) {}
+#define audit_to_watch(k, p, l, o) (-EINVAL)
+#define audit_add_watch(k, l) (-EINVAL)
+#define audit_remove_watch_rule(k) BUG()
+#define audit_watch_path(w) ""
+#define audit_watch_compare(w, i, d) 0
+
+#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
#include "audit.h"
-#include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
struct audit_chunk {
struct list_head hash;
- struct inotify_watch watch;
+ struct fsnotify_mark mark;
struct list_head trees; /* with root here */
int dead;
int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
* of watch contributes 1 to .refs).
*
* node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
* that makes a difference. Some.
*/
-static struct inotify_handle *rtree_ih;
+static struct fsnotify_group *audit_tree_group;
static struct audit_tree *alloc_tree(const char *s)
{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
return tree->pathname;
}
-static struct audit_chunk *alloc_chunk(int count)
-{
- struct audit_chunk *chunk;
- size_t size;
- int i;
-
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
- if (!chunk)
- return NULL;
-
- INIT_LIST_HEAD(&chunk->hash);
- INIT_LIST_HEAD(&chunk->trees);
- chunk->count = count;
- atomic_long_set(&chunk->refs, 1);
- for (i = 0; i < count; i++) {
- INIT_LIST_HEAD(&chunk->owners[i].list);
- chunk->owners[i].index = i;
- }
- inotify_init_watch(&chunk->watch);
- return chunk;
-}
-
static void free_chunk(struct audit_chunk *chunk)
{
int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
audit_put_chunk(chunk);
}
+static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+{
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ call_rcu(&chunk->head, __put_chunk);
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+ struct audit_chunk *chunk;
+ size_t size;
+ int i;
+
+ size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+ chunk = kzalloc(size, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->hash);
+ INIT_LIST_HEAD(&chunk->trees);
+ chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
+ for (i = 0; i < count; i++) {
+ INIT_LIST_HEAD(&chunk->owners[i].list);
+ chunk->owners[i].index = i;
+ }
+ fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ return chunk;
+}
+
enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
return chunk_hash_heads + n % HASH_SIZE;
}
-/* hash_lock is held by caller */
+/* hash_lock & entry->lock is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- struct list_head *list = chunk_hash(chunk->watch.inode);
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct list_head *list;
+
+ if (!entry->i.inode)
+ return;
+ list = chunk_hash(entry->i.inode);
list_add_rcu(&chunk->hash, list);
}
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- if (p->watch.inode == inode) {
+ /* mark.inode may have gone NULL, but who cares? */
+ if (p->mark.i.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
static void untag_chunk(struct node *p)
{
struct audit_chunk *chunk = find_chunk(p);
+ struct fsnotify_mark *entry = &chunk->mark;
struct audit_chunk *new;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
- if (!pin_inotify_watch(&chunk->watch)) {
- /*
- * Filesystem is shutting down; all watches are getting
- * evicted, just take it off the node list for this
- * tree and let the eviction logics take care of the
- * rest.
- */
- owner = p->owner;
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
- list_del_init(&p->list);
- p->owner = NULL;
- put_tree(owner);
- return;
- }
+ fsnotify_get_mark(entry);
spin_unlock(&hash_lock);
- /*
- * pin_inotify_watch() succeeded, so the watch won't go away
- * from under us.
- */
- mutex_lock(&chunk->watch.inode->inotify_mutex);
- if (chunk->dead) {
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ spin_lock(&entry->lock);
+ if (chunk->dead || !entry->i.inode) {
+ spin_unlock(&entry->lock);
goto out;
}
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
list_del_init(&p->list);
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry);
+ fsnotify_put_mark(entry);
goto out;
}
new = alloc_chunk(size);
if (!new)
goto Fallback;
- if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
+ fsnotify_duplicate_mark(&new->mark, entry);
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
free_chunk(new);
goto Fallback;
}
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
list_for_each_entry(owner, &new->trees, same_root)
owner->root = new;
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry);
+ fsnotify_put_mark(entry);
goto out;
Fallback:
@@ -314,31 +308,33 @@ Fallback:
p->owner = NULL;
put_tree(owner);
spin_unlock(&hash_lock);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ spin_unlock(&entry->lock);
out:
- unpin_inotify_watch(&chunk->watch);
+ fsnotify_put_mark(entry);
spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
+ struct fsnotify_mark *entry;
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk)
return -ENOMEM;
- if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
+ entry = &chunk->mark;
+ if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
free_chunk(chunk);
return -ENOSPC;
}
- mutex_lock(&inode->inotify_mutex);
+ spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry);
+ fsnotify_put_mark(entry);
return 0;
}
chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
}
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&inode->inotify_mutex);
+ spin_unlock(&entry->lock);
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct inotify_watch *watch;
+ struct fsnotify_mark *old_entry, *chunk_entry;
struct audit_tree *owner;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
- if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
+ old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+ if (!old_entry)
return create_chunk(inode, tree);
- old = container_of(watch, struct audit_chunk, watch);
+ old = container_of(old_entry, struct audit_chunk, mark);
/* are we already there? */
spin_lock(&hash_lock);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- put_inotify_watch(&old->watch);
+ fsnotify_put_mark(old_entry);
return 0;
}
}
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- put_inotify_watch(&old->watch);
+ fsnotify_put_mark(old_entry);
return -ENOMEM;
}
- mutex_lock(&inode->inotify_mutex);
- if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch);
+ chunk_entry = &chunk->mark;
+
+ spin_lock(&old_entry->lock);
+ if (!old_entry->i.inode) {
+ /* old_entry is being shot, lets just lie */
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(old_entry);
free_chunk(chunk);
+ return -ENOENT;
+ }
+
+ fsnotify_duplicate_mark(chunk_entry, old_entry);
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ spin_unlock(&old_entry->lock);
+ free_chunk(chunk);
+ fsnotify_put_mark(old_entry);
return -ENOSPC;
}
+
+ /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
+ spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
+
+ /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+
+ fsnotify_destroy_mark(chunk_entry);
+
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
return 0;
}
list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
list_add(&tree->same_root, &chunk->trees);
}
spin_unlock(&hash_lock);
- inotify_evict_watch(&old->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
- put_inotify_watch(&old->watch); /* and kill it */
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+ fsnotify_destroy_mark(old_entry);
+ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+ fsnotify_put_mark(old_entry); /* and kill it */
return 0;
}
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
- struct inode *inode = find_chunk(node)->watch.inode;
+ struct audit_chunk *chunk = find_chunk(node);
+ /* this could be NULL if the watch is dieing else where... */
+ struct inode *inode = chunk->mark.i.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
* Here comes the stuff asynchronous to auditctl operations
*/
-/* inode->inotify_mutex is locked */
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
mutex_unlock(&audit_filter_mutex);
}
-static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
+static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmonut_mark,
+ struct fsnotify_event *event)
+{
+ BUG();
+ return -EOPNOTSUPP;
+}
+
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
- if (mask & IN_IGNORED) {
- evict_chunk(chunk);
- put_inotify_watch(watch);
- }
+ evict_chunk(chunk);
+ fsnotify_put_mark(entry);
}
-static void destroy_watch(struct inotify_watch *watch)
+static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ __u32 mask, void *data, int data_type)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
- call_rcu(&chunk->head, __put_chunk);
+ return false;
}
-static const struct inotify_operations rtree_inotify_ops = {
- .handle_event = handle_event,
- .destroy_watch = destroy_watch,
+static const struct fsnotify_ops audit_tree_ops = {
+ .handle_event = audit_tree_handle_event,
+ .should_send_event = audit_tree_send_event,
+ .free_group_priv = NULL,
+ .free_event_priv = NULL,
+ .freeing_mark = audit_tree_freeing_mark,
};
static int __init audit_tree_init(void)
{
int i;
- rtree_ih = inotify_init(&rtree_inotify_ops);
- if (IS_ERR(rtree_ih))
- audit_panic("cannot initialize inotify handle for rectree watches");
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ if (IS_ERR(audit_tree_group))
+ audit_panic("cannot initialize fsnotify group for rectree watches");
for (i = 0; i < HASH_SIZE; i++)
INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..6bf2306be7d6 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/inotify.h>
#include <linux/security.h>
#include "audit.h"
/*
* Reference counting:
*
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
* event. Each audit_watch holds a reference to its associated parent.
*
* audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
unsigned long ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */
- struct list_head rules; /* associated rules */
+ struct list_head rules; /* anchor for krule->rlist */
};
struct audit_parent {
- struct list_head ilist; /* entry in inotify registration list */
- struct list_head watches; /* associated watches */
- struct inotify_watch wdata; /* inotify watch data */
- unsigned flags; /* status flags */
+ struct list_head watches; /* anchor for audit_watch->wlist */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
};
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
+/* fsnotify handle. */
+struct fsnotify_group *audit_watch_group;
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID 0x001
+/* fsnotify events we care about. */
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+static void audit_free_parent(struct audit_parent *parent)
+{
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
-static void audit_free_parent(struct inotify_watch *i_watch)
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
{
struct audit_parent *parent;
- parent = container_of(i_watch, struct audit_parent, wdata);
- WARN_ON(!list_empty(&parent->watches));
- kfree(parent);
+ parent = container_of(entry, struct audit_parent, mark);
+ audit_free_parent(parent);
+}
+
+static void audit_get_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_get_mark(&parent->mark);
+}
+
+static void audit_put_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode. If found a reference
+ * is taken on this parent.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+ struct audit_parent *parent = NULL;
+ struct fsnotify_mark *entry;
+
+ entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+ if (entry)
+ parent = container_of(entry, struct audit_parent, mark);
+
+ return parent;
}
void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
void audit_remove_watch(struct audit_watch *watch)
{
list_del(&watch->wlist);
- put_inotify_watch(&watch->parent->wdata);
+ audit_put_parent(watch->parent);
watch->parent = NULL;
audit_put_watch(watch); /* match initial get */
}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
return watch->path;
}
-struct list_head *audit_watch_rules(struct audit_watch *watch)
-{
- return &watch->rules;
-}
-
-unsigned long audit_watch_inode(struct audit_watch *watch)
+int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
- return watch->ino;
-}
-
-dev_t audit_watch_dev(struct audit_watch *watch)
-{
- return watch->dev;
+ return (watch->ino != (unsigned long)-1) &&
+ (watch->ino == ino) &&
+ (watch->dev == dev);
}
/* Initialize a parent watch entry. */
static struct audit_parent *audit_init_parent(struct nameidata *ndp)
{
+ struct inode *inode = ndp->path.dentry->d_inode;
struct audit_parent *parent;
- s32 wd;
+ int ret;
parent = kzalloc(sizeof(*parent), GFP_KERNEL);
if (unlikely(!parent))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&parent->watches);
- parent->flags = 0;
-
- inotify_init_watch(&parent->wdata);
- /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
- get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata,
- ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
- if (wd < 0) {
- audit_free_parent(&parent->wdata);
- return ERR_PTR(wd);
+
+ fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+ parent->mark.mask = AUDIT_FS_WATCH;
+ ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+ if (ret < 0) {
+ audit_free_parent(parent);
+ return ERR_PTR(ret);
}
return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
- if (!audit_ih)
+ if (!audit_watch_group)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
new->dev = old->dev;
new->ino = old->ino;
- get_inotify_watch(&old->parent->wdata);
+ audit_get_parent(old->parent);
new->parent = old->parent;
out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
struct audit_entry *oentry, *nentry;
mutex_lock(&audit_filter_mutex);
+ /* Run all of the watches on this parent looking for the one that
+ * matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
if (audit_compare_dname_path(dname, owatch->path, NULL))
continue;
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
- if (invalidating && current->audit_context)
+ if (invalidating && !audit_dummy_context())
audit_filter_inodes(current, current->audit_context);
+ /* updating ino will likely change which audit_hash_list we
+ * are on so we need a new watch for the new list */
nwatch = audit_dupe_watch(owatch);
if (IS_ERR(nwatch)) {
mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
list_del(&oentry->rule.rlist);
list_del_rcu(&oentry->list);
- nentry = audit_dupe_rule(&oentry->rule, nwatch);
+ nentry = audit_dupe_rule(&oentry->rule);
if (IS_ERR(nentry)) {
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
} else {
int h = audit_hash_ino((u32)ino);
+
+ /*
+ * nentry->rule.watch == oentry->rule.watch so
+ * we must drop that reference and set it to our
+ * new watch.
+ */
+ audit_put_watch(nentry->rule.watch);
+ audit_get_watch(nwatch);
+ nentry->rule.watch = nwatch;
list_add(&nentry->rule.rlist, &nwatch->rules);
list_add_rcu(&nentry->list, &audit_inode_hash[h]);
list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
struct audit_entry *e;
mutex_lock(&audit_filter_mutex);
- parent->flags |= AUDIT_PARENT_INVALID;
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
audit_remove_watch(w);
}
mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-void audit_inotify_unregister(struct list_head *in_list)
-{
- struct audit_parent *p, *n;
- list_for_each_entry_safe(p, n, in_list, ilist) {
- list_del(&p->ilist);
- inotify_rm_watch(audit_ih, &p->wdata);
- /* the unpin matching the pin in audit_do_del_rule() */
- unpin_inotify_watch(&p->wdata);
- }
+ fsnotify_destroy_mark(&parent->mark);
}
/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
}
}
-/* Associate the given rule with an existing parent inotify_watch.
+/* Associate the given rule with an existing parent.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
struct audit_watch *w, *watch = krule->watch;
int watch_found = 0;
+ BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
list_for_each_entry(w, &parent->watches, wlist) {
if (strcmp(watch->path, w->path))
continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
}
if (!watch_found) {
- get_inotify_watch(&parent->wdata);
+ audit_get_parent(parent);
watch->parent = parent;
list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
/* Find a matching watch entry, or add this one.
* Caller must hold audit_filter_mutex. */
-int audit_add_watch(struct audit_krule *krule)
+int audit_add_watch(struct audit_krule *krule, struct list_head **list)
{
struct audit_watch *watch = krule->watch;
- struct inotify_watch *i_watch;
struct audit_parent *parent;
struct nameidata *ndp = NULL, *ndw = NULL;
- int ret = 0;
+ int h, ret = 0;
mutex_unlock(&audit_filter_mutex);
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
goto error;
}
+ mutex_lock(&audit_filter_mutex);
+
/* update watch filter fields */
if (ndw) {
watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
watch->ino = ndw->path.dentry->d_inode->i_ino;
}
- /* The audit_filter_mutex must not be held during inotify calls because
- * we hold it during inotify event callback processing. If an existing
- * inotify watch is found, inotify_find_watch() grabs a reference before
- * returning.
- */
- if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
- &i_watch) < 0) {
+ /* either find an old parent or attach a new one */
+ parent = audit_find_parent(ndp->path.dentry->d_inode);
+ if (!parent) {
parent = audit_init_parent(ndp);
if (IS_ERR(parent)) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
ret = PTR_ERR(parent);
goto error;
}
- } else
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- mutex_lock(&audit_filter_mutex);
+ }
- /* parent was moved before we took audit_filter_mutex */
- if (parent->flags & AUDIT_PARENT_INVALID)
- ret = -ENOENT;
- else
- audit_add_to_parent(krule, parent);
+ audit_add_to_parent(krule, parent);
- /* match get in audit_init_parent or inotify_find_watch */
- put_inotify_watch(&parent->wdata);
+ /* match get in audit_find_parent or audit_init_parent */
+ audit_put_parent(parent);
+ h = audit_hash_ino((u32)watch->ino);
+ *list = &audit_inode_hash[h];
error:
audit_put_nd(ndp, ndw); /* NULL args OK */
return ret;
}
-void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+void audit_remove_watch_rule(struct audit_krule *krule)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
audit_remove_watch(watch);
if (list_empty(&parent->watches)) {
- /* Put parent on the inotify un-registration
- * list. Grab a reference before releasing
- * audit_filter_mutex, to be released in
- * audit_inotify_unregister().
- * If filesystem is going away, just leave
- * the sucker alone, eviction will take
- * care of it. */
- if (pin_inotify_watch(&parent->wdata))
- list_add(&parent->ilist, list);
+ audit_get_parent(parent);
+ fsnotify_destroy_mark(&parent->mark);
+ audit_put_parent(parent);
}
}
}
-/* Update watch data in audit rules based on inotify events. */
-static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
+static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ __u32 mask, void *data, int data_type)
+{
+ return true;
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ struct fsnotify_event *event)
{
+ struct inode *inode;
+ __u32 mask = event->mask;
+ const char *dname = event->file_name;
struct audit_parent *parent;
- parent = container_of(i_watch, struct audit_parent, wdata);
+ parent = container_of(inode_mark, struct audit_parent, mark);
- if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
- audit_update_watch(parent, dname, inode->i_sb->s_dev,
- inode->i_ino, 0);
- else if (mask & (IN_DELETE|IN_MOVED_FROM))
+ BUG_ON(group != audit_watch_group);
+
+ switch (event->data_type) {
+ case (FSNOTIFY_EVENT_FILE):
+ inode = event->file->f_path.dentry->d_inode;
+ break;
+ case (FSNOTIFY_EVENT_INODE):
+ inode = event->inode;
+ break;
+ default:
+ BUG();
+ inode = NULL;
+ break;
+ };
+
+ if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+ else if (mask & (FS_DELETE|FS_MOVED_FROM))
audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
- /* inotify automatically removes the watch and sends IN_IGNORED */
- else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
- audit_remove_parent_watches(parent);
- /* inotify does not remove the watch, so remove it manually */
- else if(mask & IN_MOVE_SELF) {
+ else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
- inotify_remove_watch_locked(audit_ih, i_watch);
- } else if (mask & IN_IGNORED)
- put_inotify_watch(i_watch);
+
+ return 0;
}
-static const struct inotify_operations audit_inotify_ops = {
- .handle_event = audit_handle_ievent,
- .destroy_watch = audit_free_parent,
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+ .should_send_event = audit_watch_should_send_event,
+ .handle_event = audit_watch_handle_event,
+ .free_group_priv = NULL,
+ .freeing_mark = NULL,
+ .free_event_priv = NULL,
};
static int __init audit_watch_init(void)
{
- audit_ih = inotify_init(&audit_inotify_ops);
- if (IS_ERR(audit_ih))
- audit_panic("cannot initialize inotify handle");
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ if (IS_ERR(audit_watch_group)) {
+ audit_watch_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
return 0;
}
-subsys_initcall(audit_watch_init);
+device_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
{
int i;
struct audit_krule *erule = &e->rule;
+
/* some rules don't have associated watches */
if (erule->watch)
audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
-struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
new->prio = old->prio;
new->buflen = old->buflen;
new->inode_f = old->inode_f;
- new->watch = NULL;
new->field_count = old->field_count;
+
/*
* note that we are OK with not refcounting here; audit_match_tree()
* never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
}
}
- if (watch) {
- audit_get_watch(watch);
- new->watch = watch;
+ if (old->watch) {
+ audit_get_watch(old->watch);
+ new->watch = old->watch;
}
return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- int h, err;
+ int err;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
if (watch) {
/* audit_filter_mutex is dropped and re-taken during this call */
- err = audit_add_watch(&entry->rule);
+ err = audit_add_watch(&entry->rule, &list);
if (err) {
mutex_unlock(&audit_filter_mutex);
goto error;
}
- /* entry->rule.watch may have changed during audit_add_watch() */
- watch = entry->rule.watch;
- h = audit_hash_ino((u32)audit_watch_inode(watch));
- list = &audit_inode_hash[h];
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- LIST_HEAD(inotify_list);
int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
}
if (e->rule.watch)
- audit_remove_watch_rule(&e->rule, &inotify_list);
+ audit_remove_watch_rule(&e->rule);
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- if (!list_empty(&inotify_list))
- audit_inotify_unregister(&inotify_list);
-
out:
if (watch)
audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
{
struct audit_entry *entry = container_of(r, struct audit_entry, rule);
struct audit_entry *nentry;
- struct audit_watch *watch;
- struct audit_tree *tree;
int err = 0;
if (!security_audit_rule_known(r))
return 0;
- watch = r->watch;
- tree = r->tree;
- nentry = audit_dupe_rule(r, watch);
+ nentry = audit_dupe_rule(r);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
err = PTR_ERR(nentry);
audit_panic("error updating LSM filters");
- if (watch)
+ if (r->watch)
list_del(&r->rlist);
list_del_rcu(&entry->list);
list_del(&r->list);
} else {
- if (watch) {
- list_add(&nentry->rule.rlist, audit_watch_rules(watch));
- list_del(&r->rlist);
- } else if (tree)
+ if (r->watch || r->tree)
list_replace_init(&r->rlist, &nentry->rule.rlist);
list_replace_rcu(&entry->list, &nentry->list);
list_replace(&r->list, &nentry->rule.list);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..b87a63beb66c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/inotify.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
- result = (name->dev == audit_watch_dev(rule->watch) &&
- name->ino == audit_watch_inode(rule->watch));
+ if (name)
+ result = audit_watch_compare(rule->watch, name->ino, name->dev);
break;
case AUDIT_DIR:
if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
- if (likely(list_empty(&inode->inotify_watches)))
+ if (likely(hlist_empty(&inode->i_fsnotify_marks)))
return;
context = current->audit_context;
p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d->d_inode;
- if (inode && unlikely(!list_empty(&inode->inotify_watches))) {
+ if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
if (chunk) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 291775021b2e..d83cab06da87 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct kobject *cgroup_kobj;
+
static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
return dentry->d_fsdata;
@@ -1788,6 +1790,29 @@ out:
return retval;
}
+/**
+ * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_current_cg(struct task_struct *tsk)
+{
+ struct cgroupfs_root *root;
+ struct cgroup *cur_cg;
+ int retval = 0;
+
+ cgroup_lock();
+ for_each_active_root(root) {
+ cur_cg = task_cgroup_from_root(current, root);
+ retval = cgroup_attach_task(cur_cg, tsk);
+ if (retval)
+ break;
+ }
+ cgroup_unlock();
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
+
/*
* Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
* held. May take task_lock of task
@@ -2994,7 +3019,6 @@ static void cgroup_event_remove(struct work_struct *work)
remove);
struct cgroup *cgrp = event->cgrp;
- /* TODO: check return code */
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
eventfd_ctx_put(event->eventfd);
@@ -3872,9 +3896,18 @@ int __init cgroup_init(void)
hhead = css_set_hash(init_css_set.subsys);
hlist_add_head(&init_css_set.hlist, hhead);
BUG_ON(!init_root_id(&rootnode));
+
+ cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
+ if (!cgroup_kobj) {
+ err = -ENOMEM;
+ goto out;
+ }
+
err = register_filesystem(&cgroup_fs_type);
- if (err < 0)
+ if (err < 0) {
+ kobject_put(cgroup_kobj);
goto out;
+ }
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
@@ -4599,7 +4632,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
parent_css = parent->subsys[subsys_id];
child_css = child->subsys[subsys_id];
parent_id = parent_css->id;
- depth = parent_id->depth;
+ depth = parent_id->depth + 1;
child_id = get_new_cssid(ss, depth);
if (IS_ERR(child_id))
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 545777574779..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,13 +20,29 @@
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
-static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
+/*
+ * The following two API's must be used when attempting
+ * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ */
+void cpu_maps_update_begin(void)
+{
+ mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+ mutex_unlock(&cpu_add_remove_lock);
+}
+
+static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
+#ifdef CONFIG_HOTPLUG_CPU
+
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -41,8 +57,6 @@ static struct {
.refcount = 0,
};
-#ifdef CONFIG_HOTPLUG_CPU
-
void get_online_cpus(void)
{
might_sleep();
@@ -67,22 +81,6 @@ void put_online_cpus(void)
}
EXPORT_SYMBOL_GPL(put_online_cpus);
-#endif /* CONFIG_HOTPLUG_CPU */
-
-/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
- */
-void cpu_maps_update_begin(void)
-{
- mutex_lock(&cpu_add_remove_lock);
-}
-
-void cpu_maps_update_done(void)
-{
- mutex_unlock(&cpu_add_remove_lock);
-}
-
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
@@ -124,6 +122,12 @@ static void cpu_hotplug_done(void)
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
}
+
+#else /* #if CONFIG_HOTPLUG_CPU */
+static void cpu_hotplug_begin(void) {}
+static void cpu_hotplug_done(void) {}
+#endif /* #esle #if CONFIG_HOTPLUG_CPU */
+
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
@@ -134,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+ int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+ nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_notify(unsigned long val, void *v)
+{
+ return __cpu_notify(val, v, -1, NULL);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+ BUG_ON(cpu_notify(val, v));
+}
+
EXPORT_SYMBOL(register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -181,8 +206,7 @@ static int __ref take_cpu_down(void *_param)
if (err < 0)
return err;
- raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
- param->hcpu);
+ cpu_notify(CPU_DYING | param->mod, param->hcpu);
if (task_cpu(param->caller) == cpu)
move_task_off_dead_cpu(cpu, param->caller);
@@ -211,28 +235,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
return -EINVAL;
cpu_hotplug_begin();
- set_cpu_active(cpu, false);
- err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
- hcpu, -1, &nr_calls);
- if (err == NOTIFY_BAD) {
- set_cpu_active(cpu, true);
-
+ err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (err) {
nr_calls--;
- __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu, nr_calls, NULL);
+ __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
printk("%s: attempt to take down CPU %u failed\n",
__func__, cpu);
- err = -EINVAL;
goto out_release;
}
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
- set_cpu_active(cpu, true);
/* CPU didn't die: tell everyone. Can't complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
+ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
goto out_release;
}
@@ -246,19 +261,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
+ cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
out_release:
cpu_hotplug_done();
- if (!err) {
- if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
- }
+ if (!err)
+ cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
}
@@ -293,13 +303,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
return -EINVAL;
cpu_hotplug_begin();
- ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
- -1, &nr_calls);
- if (ret == NOTIFY_BAD) {
+ ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (ret) {
nr_calls--;
printk("%s: attempt to bring up CPU %u failed\n",
__func__, cpu);
- ret = -EINVAL;
goto out_notify;
}
@@ -309,15 +317,12 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- set_cpu_active(cpu, true);
-
/* Now call notifier in preparation. */
- raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
+ cpu_notify(CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
- __raw_notifier_call_chain(&cpu_chain,
- CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
cpu_hotplug_done();
return ret;
@@ -326,6 +331,12 @@ out_notify:
int __cpuinit cpu_up(unsigned int cpu)
{
int err = 0;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ int nid;
+ pg_data_t *pgdat;
+#endif
+
if (!cpu_possible(cpu)) {
printk(KERN_ERR "can't online cpu %d because it is not "
"configured as may-hotadd at boot time\n", cpu);
@@ -336,6 +347,28 @@ int __cpuinit cpu_up(unsigned int cpu)
return -EINVAL;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
+ nid = cpu_to_node(cpu);
+ if (!node_online(nid)) {
+ err = mem_online_node(nid);
+ if (err)
+ return err;
+ }
+
+ pgdat = NODE_DATA(nid);
+ if (!pgdat) {
+ printk(KERN_ERR
+ "Can't online cpu %d due to NULL pgdat\n", cpu);
+ return -ENOMEM;
+ }
+
+ if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
+#endif
+
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -355,7 +388,7 @@ static cpumask_var_t frozen_cpus;
int disable_nonboot_cpus(void)
{
- int cpu, first_cpu, error;
+ int cpu, first_cpu, error = 0;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
@@ -453,7 +486,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
- raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+ cpu_notify(val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a50c5f6e727..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset heirarchy */
+ /* used for walking a cpuset hierarchy */
struct list_head stack_list;
};
@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
* In order to avoid seeing no nodes if the old and new nodes are disjoint,
* we structure updates as setting all new allowed nodes, then clearing newly
* disallowed ones.
- *
- * Called with task's alloc_lock held
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
+repeat:
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return;
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return;
+
+ task_lock(tsk);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- mpol_rebind_task(tsk, newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+
+ /*
+ * ensure checking ->mems_allowed_change_disable after setting all new
+ * allowed nodes.
+ *
+ * the read-side task can see an nodemask with new allowed nodes and
+ * old allowed nodes. and if it allocates page when cpuset clears newly
+ * disallowed ones continuous, it can see the new allowed bits.
+ *
+ * And if setting all new allowed nodes is after the checking, setting
+ * all new allowed nodes and clearing newly disallowed ones will be done
+ * continuous, and the read-side task may find no node to alloc page.
+ */
+ smp_mb();
+
+ /*
+ * Allocation of memory is very fast, we needn't sleep when waiting
+ * for the read-side.
+ */
+ while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+ task_unlock(tsk);
+ if (!task_curr(tsk))
+ yield();
+ goto repeat;
+ }
+
+ /*
+ * ensure checking ->mems_allowed_change_disable before clearing all new
+ * disallowed nodes.
+ *
+ * if clearing newly disallowed bits before the checking, the read-side
+ * task may find no node to alloc page.
+ */
+ smp_mb();
+
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
+ task_unlock(tsk);
}
/*
@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, newmems);
- task_lock(p);
cpuset_change_task_nodemask(p, newmems);
- task_unlock(p);
NODEMASK_FREE(newmems);
@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- task_lock(tsk);
cpuset_change_task_nodemask(tsk, to);
- task_unlock(tsk);
cpuset_update_task_spread_flag(cs, tsk);
}
@@ -2071,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
* but making no active use of cpusets.
*
* This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
*
* Called within get_online_cpus(). Needs to call cgroup_lock()
* before calling generate_sched_domains().
*/
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
- unsigned long phase, void *unused_cpu)
+void cpuset_update_active_cpus(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
- switch (phase) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
cgroup_lock();
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2106,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-
- return NOTIFY_OK;
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2161,7 +2187,6 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- hotcpu_notifier(cpuset_track_online_cpus, 0);
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
cpuset_wq = create_singlethread_workqueue("cpuset");
@@ -2427,7 +2452,8 @@ void cpuset_unlock(void)
}
/**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2452,16 +2478,27 @@ void cpuset_unlock(void)
* See kmem_cache_alloc_node().
*/
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
{
int node;
- node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+ node = next_node(*rotor, current->mems_allowed);
if (node == MAX_NUMNODES)
node = first_node(current->mems_allowed);
- current->cpuset_mem_spread_rotor = node;
+ *rotor = node;
return node;
}
+
+int cpuset_mem_spread_node(void)
+{
+ return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+ return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
diff --git a/kernel/cred.c b/kernel/cred.c
index 2c24870c55d1..60bc8b1e32e6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk)
}
}
+/**
+ * get_task_cred - Get another task's objective credentials
+ * @task: The task to query
+ *
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away. Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must also make sure task doesn't get deleted, either by holding a
+ * ref on task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+const struct cred *get_task_cred(struct task_struct *task)
+{
+ const struct cred *cred;
+
+ rcu_read_lock();
+
+ do {
+ cred = __task_cred((task));
+ BUG_ON(!cred);
+ } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+
+ rcu_read_unlock();
+ return cred;
+}
+
/*
* Allocate blank credentials, such that the credentials can be filled in at a
* later date without risk of ENOMEM.
@@ -347,66 +372,6 @@ struct cred *prepare_exec_creds(void)
}
/*
- * prepare new credentials for the usermode helper dispatcher
- */
-struct cred *prepare_usermodehelper_creds(void)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = NULL;
-#endif
- struct cred *new;
-
-#ifdef CONFIG_KEYS
- tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
- if (!tgcred)
- return NULL;
-#endif
-
- new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
- if (!new)
- goto free_tgcred;
-
- kdebug("prepare_usermodehelper_creds() alloc %p", new);
-
- memcpy(new, &init_cred, sizeof(struct cred));
-
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
- get_group_info(new->group_info);
- get_uid(new->user);
-
-#ifdef CONFIG_KEYS
- new->thread_keyring = NULL;
- new->request_key_auth = NULL;
- new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- new->tgcred = tgcred;
-#endif
-
-#ifdef CONFIG_SECURITY
- new->security = NULL;
-#endif
- if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
- goto error;
- validate_creds(new);
-
- BUG_ON(atomic_read(&new->usage) != 1);
- return new;
-
-error:
- put_cred(new);
- return NULL;
-
-free_tgcred:
-#ifdef CONFIG_KEYS
- kfree(tgcred);
-#endif
- return NULL;
-}
-
-/*
* Copy credentials for the new process created by fork()
*
* We share if we can, but under some circumstances we have to generate a new
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000000..a85edc339985
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the linux kernel debugger
+#
+
+obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
+obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000000..3c2d4972d235
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,985 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+#include <linux/pid_namespace.h>
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/console.h>
+#include <linux/threads.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/pid.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/cacheflush.h>
+#include <asm/byteorder.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+#include "debug_core.h"
+
+static int kgdb_break_asap;
+
+struct debuggerinfo_struct kgdb_info[NR_CPUS];
+
+/**
+ * kgdb_connected - Is a host GDB connected to us?
+ */
+int kgdb_connected;
+EXPORT_SYMBOL_GPL(kgdb_connected);
+
+/* All the KGDB handlers are installed */
+int kgdb_io_module_registered;
+
+/* Guard for recursive entry */
+static int exception_level;
+
+struct kgdb_io *dbg_io_ops;
+static DEFINE_SPINLOCK(kgdb_registration_lock);
+
+/* kgdb console driver is loaded */
+static int kgdb_con_registered;
+/* determine if kgdb console output should be used */
+static int kgdb_use_con;
+/* Flag for alternate operations for early debugging */
+bool dbg_is_early = true;
+/* Next cpu to become the master debug core */
+int dbg_switch_cpu;
+
+/* Use kdb or gdbserver mode */
+int dbg_kdb_mode = 1;
+
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
+module_param(kgdb_use_con, int, 0644);
+
+/*
+ * Holds information about breakpoints in a kernel. These breakpoints are
+ * added and removed by gdb.
+ */
+static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
+ [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
+};
+
+/*
+ * The CPU# of the active CPU, or -1 if none:
+ */
+atomic_t kgdb_active = ATOMIC_INIT(-1);
+EXPORT_SYMBOL_GPL(kgdb_active);
+
+/*
+ * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
+ * bootup code (which might not have percpu set up yet):
+ */
+static atomic_t passive_cpu_wait[NR_CPUS];
+static atomic_t cpu_in_kgdb[NR_CPUS];
+static atomic_t kgdb_break_tasklet_var;
+atomic_t kgdb_setting_breakpoint;
+
+struct task_struct *kgdb_usethread;
+struct task_struct *kgdb_contthread;
+
+int kgdb_single_step;
+static pid_t kgdb_sstep_pid;
+
+/* to keep track of the CPU which is doing the single stepping*/
+atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
+
+/*
+ * If you are debugging a problem where roundup (the collection of
+ * all other CPUs) is a problem [this should be extremely rare],
+ * then use the nokgdbroundup option to avoid roundup. In that case
+ * the other CPUs might interfere with your debugging context, so
+ * use this with care:
+ */
+static int kgdb_do_roundup = 1;
+
+static int __init opt_nokgdbroundup(char *str)
+{
+ kgdb_do_roundup = 0;
+
+ return 0;
+}
+
+early_param("nokgdbroundup", opt_nokgdbroundup);
+
+/*
+ * Finally, some KGDB code :-)
+ */
+
+/*
+ * Weak aliases for breakpoint management,
+ * can be overriden by architectures when needed:
+ */
+int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+{
+ int err;
+
+ err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+
+ return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+}
+
+int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+{
+ return probe_kernel_write((char *)addr,
+ (char *)bundle, BREAK_INSTR_SIZE);
+}
+
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+ char tmp_variable[BREAK_INSTR_SIZE];
+ int err;
+ /* Validate setting the breakpoint and then removing it. In the
+ * remove fails, the kernel needs to emit a bad message because we
+ * are deep trouble not being able to put things back the way we
+ * found them.
+ */
+ err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+ if (err)
+ return err;
+ err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+ if (err)
+ printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
+ "memory destroyed at: %lx", addr);
+ return err;
+}
+
+unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+ return instruction_pointer(regs);
+}
+
+int __weak kgdb_arch_init(void)
+{
+ return 0;
+}
+
+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ return 0;
+}
+
+/**
+ * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
+ * @regs: Current &struct pt_regs.
+ *
+ * This function will be called if the particular architecture must
+ * disable hardware debugging while it is processing gdb packets or
+ * handling exception.
+ */
+void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
+{
+}
+
+/*
+ * Some architectures need cache flushes when we set/clear a
+ * breakpoint:
+ */
+static void kgdb_flush_swbreak_addr(unsigned long addr)
+{
+ if (!CACHE_FLUSH_IS_SAFE)
+ return;
+
+ if (current->mm && current->mm->mmap_cache) {
+ flush_cache_range(current->mm->mmap_cache,
+ addr, addr + BREAK_INSTR_SIZE);
+ }
+ /* Force flush instruction cache if it was outside the mm */
+ flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * SW breakpoint management:
+ */
+int dbg_activate_sw_breakpoints(void)
+{
+ unsigned long addr;
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_SET)
+ continue;
+
+ addr = kgdb_break[i].bpt_addr;
+ error = kgdb_arch_set_breakpoint(addr,
+ kgdb_break[i].saved_instr);
+ if (error) {
+ ret = error;
+ printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+ continue;
+ }
+
+ kgdb_flush_swbreak_addr(addr);
+ kgdb_break[i].state = BP_ACTIVE;
+ }
+ return ret;
+}
+
+int dbg_set_sw_break(unsigned long addr)
+{
+ int err = kgdb_validate_break_address(addr);
+ int breakno = -1;
+ int i;
+
+ if (err)
+ return err;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return -EEXIST;
+ }
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_REMOVED &&
+ kgdb_break[i].bpt_addr == addr) {
+ breakno = i;
+ break;
+ }
+ }
+
+ if (breakno == -1) {
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_UNDEFINED) {
+ breakno = i;
+ break;
+ }
+ }
+ }
+
+ if (breakno == -1)
+ return -E2BIG;
+
+ kgdb_break[breakno].state = BP_SET;
+ kgdb_break[breakno].type = BP_BREAKPOINT;
+ kgdb_break[breakno].bpt_addr = addr;
+
+ return 0;
+}
+
+int dbg_deactivate_sw_breakpoints(void)
+{
+ unsigned long addr;
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ continue;
+ addr = kgdb_break[i].bpt_addr;
+ error = kgdb_arch_remove_breakpoint(addr,
+ kgdb_break[i].saved_instr);
+ if (error) {
+ printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+ ret = error;
+ }
+
+ kgdb_flush_swbreak_addr(addr);
+ kgdb_break[i].state = BP_SET;
+ }
+ return ret;
+}
+
+int dbg_remove_sw_break(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr)) {
+ kgdb_break[i].state = BP_REMOVED;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int kgdb_isremovedbreak(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_REMOVED) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return 1;
+ }
+ return 0;
+}
+
+int dbg_remove_all_break(void)
+{
+ unsigned long addr;
+ int error;
+ int i;
+
+ /* Clear memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ goto setundefined;
+ addr = kgdb_break[i].bpt_addr;
+ error = kgdb_arch_remove_breakpoint(addr,
+ kgdb_break[i].saved_instr);
+ if (error)
+ printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+ addr);
+setundefined:
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+
+ /* Clear hardware breakpoints. */
+ if (arch_kgdb_ops.remove_all_hw_break)
+ arch_kgdb_ops.remove_all_hw_break();
+
+ return 0;
+}
+
+/*
+ * Return true if there is a valid kgdb I/O module. Also if no
+ * debugger is attached a message can be printed to the console about
+ * waiting for the debugger to attach.
+ *
+ * The print_wait argument is only to be true when called from inside
+ * the core kgdb_handle_exception, because it will wait for the
+ * debugger to attach.
+ */
+static int kgdb_io_ready(int print_wait)
+{
+ if (!dbg_io_ops)
+ return 0;
+ if (kgdb_connected)
+ return 1;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ return 1;
+ if (print_wait) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+#else
+ printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+#endif
+ }
+ return 1;
+}
+
+static int kgdb_reenter_check(struct kgdb_state *ks)
+{
+ unsigned long addr;
+
+ if (atomic_read(&kgdb_active) != raw_smp_processor_id())
+ return 0;
+
+ /* Panic on recursive debugger calls: */
+ exception_level++;
+ addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ dbg_deactivate_sw_breakpoints();
+
+ /*
+ * If the break point removed ok at the place exception
+ * occurred, try to recover and print a warning to the end
+ * user because the user planted a breakpoint in a place that
+ * KGDB needs in order to function.
+ */
+ if (dbg_remove_sw_break(addr) == 0) {
+ exception_level = 0;
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
+ addr);
+ WARN_ON_ONCE(1);
+
+ return 1;
+ }
+ dbg_remove_all_break();
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+
+ if (exception_level > 1) {
+ dump_stack();
+ panic("Recursive entry to debugger");
+ }
+
+ printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+#ifdef CONFIG_KGDB_KDB
+ /* Allow kdb to debug itself one level */
+ return 0;
+#endif
+ dump_stack();
+ panic("Recursive entry to debugger");
+
+ return 1;
+}
+
+static void dbg_cpu_switch(int cpu, int next_cpu)
+{
+ /* Mark the cpu we are switching away from as a slave when it
+ * holds the kgdb_active token. This must be done so that the
+ * that all the cpus wait in for the debug core will not enter
+ * again as the master. */
+ if (cpu == atomic_read(&kgdb_active)) {
+ kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
+ kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
+ }
+ kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
+}
+
+static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
+{
+ unsigned long flags;
+ int sstep_tries = 100;
+ int error;
+ int i, cpu;
+ int trace_on = 0;
+acquirelock:
+ /*
+ * Interrupts will be restored by the 'trap return' code, except when
+ * single stepping.
+ */
+ local_irq_save(flags);
+
+ cpu = ks->cpu;
+ kgdb_info[cpu].debuggerinfo = regs;
+ kgdb_info[cpu].task = current;
+ kgdb_info[cpu].ret_state = 0;
+ kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
+ /*
+ * Make sure the above info reaches the primary CPU before
+ * our cpu_in_kgdb[] flag setting does:
+ */
+ atomic_inc(&cpu_in_kgdb[cpu]);
+
+ if (exception_level == 1)
+ goto cpu_master_loop;
+
+ /*
+ * CPU will loop if it is a slave or request to become a kgdb
+ * master cpu and acquire the kgdb_active lock:
+ */
+ while (1) {
+cpu_loop:
+ if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
+ kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
+ goto cpu_master_loop;
+ } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+ if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
+ break;
+ } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
+ if (!atomic_read(&passive_cpu_wait[cpu]))
+ goto return_normal;
+ } else {
+return_normal:
+ /* Return to normal operation by executing any
+ * hw breakpoint fixup.
+ */
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ if (trace_on)
+ tracing_on();
+ atomic_dec(&cpu_in_kgdb[cpu]);
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ local_irq_restore(flags);
+ return 0;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * For single stepping, try to only enter on the processor
+ * that was single stepping. To gaurd against a deadlock, the
+ * kernel will only try for the value of sstep_tries before
+ * giving up and continuing on.
+ */
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
+ (kgdb_info[cpu].task &&
+ kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
+ atomic_set(&kgdb_active, -1);
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ local_irq_restore(flags);
+
+ goto acquirelock;
+ }
+
+ if (!kgdb_io_ready(1)) {
+ kgdb_info[cpu].ret_state = 1;
+ goto kgdb_restore; /* No I/O connection, resume the system */
+ }
+
+ /*
+ * Don't enter if we have hit a removed breakpoint.
+ */
+ if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
+ goto kgdb_restore;
+
+ /* Call the I/O driver's pre_exception routine */
+ if (dbg_io_ops->pre_exception)
+ dbg_io_ops->pre_exception();
+
+ kgdb_disable_hw_debug(ks->linux_regs);
+
+ /*
+ * Get the passive CPU lock which will hold all the non-primary
+ * CPU in a spin state while the debugger is active
+ */
+ if (!kgdb_single_step) {
+ for (i = 0; i < NR_CPUS; i++)
+ atomic_inc(&passive_cpu_wait[i]);
+ }
+
+#ifdef CONFIG_SMP
+ /* Signal the other CPUs to enter kgdb_wait() */
+ if ((!kgdb_single_step) && kgdb_do_roundup)
+ kgdb_roundup_cpus(flags);
+#endif
+
+ /*
+ * Wait for the other CPUs to be notified and be waiting for us:
+ */
+ for_each_online_cpu(i) {
+ while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
+ cpu_relax();
+ }
+
+ /*
+ * At this point the primary processor is completely
+ * in the debugger and all secondary CPUs are quiescent
+ */
+ dbg_deactivate_sw_breakpoints();
+ kgdb_single_step = 0;
+ kgdb_contthread = current;
+ exception_level = 0;
+ trace_on = tracing_is_on();
+ if (trace_on)
+ tracing_off();
+
+ while (1) {
+cpu_master_loop:
+ if (dbg_kdb_mode) {
+ kgdb_connected = 1;
+ error = kdb_stub(ks);
+ if (error == -1)
+ continue;
+ kgdb_connected = 0;
+ } else {
+ error = gdb_serial_stub(ks);
+ }
+
+ if (error == DBG_PASS_EVENT) {
+ dbg_kdb_mode = !dbg_kdb_mode;
+ } else if (error == DBG_SWITCH_CPU_EVENT) {
+ dbg_cpu_switch(cpu, dbg_switch_cpu);
+ goto cpu_loop;
+ } else {
+ kgdb_info[cpu].ret_state = error;
+ break;
+ }
+ }
+
+ /* Call the I/O driver's post_exception routine */
+ if (dbg_io_ops->post_exception)
+ dbg_io_ops->post_exception();
+
+ atomic_dec(&cpu_in_kgdb[ks->cpu]);
+
+ if (!kgdb_single_step) {
+ for (i = NR_CPUS-1; i >= 0; i--)
+ atomic_dec(&passive_cpu_wait[i]);
+ /*
+ * Wait till all the CPUs have quit from the debugger,
+ * but allow a CPU that hit an exception and is
+ * waiting to become the master to remain in the debug
+ * core.
+ */
+ for_each_online_cpu(i) {
+ while (kgdb_do_roundup &&
+ atomic_read(&cpu_in_kgdb[i]) &&
+ !(kgdb_info[i].exception_state &
+ DCPU_WANT_MASTER))
+ cpu_relax();
+ }
+ }
+
+kgdb_restore:
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+ int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+ if (kgdb_info[sstep_cpu].task)
+ kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+ else
+ kgdb_sstep_pid = 0;
+ }
+ if (trace_on)
+ tracing_on();
+ /* Free kgdb_active */
+ atomic_set(&kgdb_active, -1);
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ local_irq_restore(flags);
+
+ return kgdb_info[cpu].ret_state;
+}
+
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * Locking hierarchy:
+ * interface locks, if any (begin_session)
+ * kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+ int ret;
+
+ ks->cpu = raw_smp_processor_id();
+ ks->ex_vector = evector;
+ ks->signo = signo;
+ ks->err_code = ecode;
+ ks->kgdb_usethreadid = 0;
+ ks->linux_regs = regs;
+
+ if (kgdb_reenter_check(ks))
+ return 0; /* Ouch, double exception ! */
+ kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
+ ret = kgdb_cpu_enter(ks, regs);
+ kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
+ DCPU_IS_SLAVE);
+ return ret;
+}
+
+int kgdb_nmicallback(int cpu, void *regs)
+{
+#ifdef CONFIG_SMP
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->linux_regs = regs;
+
+ if (!atomic_read(&cpu_in_kgdb[cpu]) &&
+ atomic_read(&kgdb_active) != -1 &&
+ atomic_read(&kgdb_active) != cpu) {
+ kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
+ kgdb_cpu_enter(ks, regs);
+ kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+static void kgdb_console_write(struct console *co, const char *s,
+ unsigned count)
+{
+ unsigned long flags;
+
+ /* If we're debugging, or KGDB has not connected, don't try
+ * and print. */
+ if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
+ return;
+
+ local_irq_save(flags);
+ gdbstub_msg_write(s, count);
+ local_irq_restore(flags);
+}
+
+static struct console kgdbcons = {
+ .name = "kgdb",
+ .write = kgdb_console_write,
+ .flags = CON_PRINTBUFFER | CON_ENABLED,
+ .index = -1,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void sysrq_handle_dbg(int key, struct tty_struct *tty)
+{
+ if (!dbg_io_ops) {
+ printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+ return;
+ }
+ if (!kgdb_connected) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+#else
+ printk(KERN_CRIT "Entering KGDB\n");
+#endif
+ }
+
+ kgdb_breakpoint();
+}
+
+static struct sysrq_key_op sysrq_dbg_op = {
+ .handler = sysrq_handle_dbg,
+ .help_msg = "debug(G)",
+ .action_msg = "DEBUG",
+};
+#endif
+
+static int kgdb_panic_event(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (dbg_kdb_mode)
+ kdb_printf("PANIC: %s\n", (char *)data);
+ kgdb_breakpoint();
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kgdb_panic_event_nb = {
+ .notifier_call = kgdb_panic_event,
+ .priority = INT_MAX,
+};
+
+void __weak kgdb_arch_late(void)
+{
+}
+
+void __init dbg_late_init(void)
+{
+ dbg_is_early = false;
+ if (kgdb_io_module_registered)
+ kgdb_arch_late();
+ kdb_init(KDB_INIT_FULL);
+}
+
+static void kgdb_register_callbacks(void)
+{
+ if (!kgdb_io_module_registered) {
+ kgdb_io_module_registered = 1;
+ kgdb_arch_init();
+ if (!dbg_is_early)
+ kgdb_arch_late();
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+#ifdef CONFIG_MAGIC_SYSRQ
+ register_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_use_con && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+ }
+}
+
+static void kgdb_unregister_callbacks(void)
+{
+ /*
+ * When this routine is called KGDB should unregister from the
+ * panic handler and clean up, making sure it is not handling any
+ * break exceptions at the time.
+ */
+ if (kgdb_io_module_registered) {
+ kgdb_io_module_registered = 0;
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+ kgdb_arch_exit();
+#ifdef CONFIG_MAGIC_SYSRQ
+ unregister_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_con_registered) {
+ unregister_console(&kgdbcons);
+ kgdb_con_registered = 0;
+ }
+ }
+}
+
+/*
+ * There are times a tasklet needs to be used vs a compiled in
+ * break point so as to cause an exception outside a kgdb I/O module,
+ * such as is the case with kgdboe, where calling a breakpoint in the
+ * I/O driver itself would be fatal.
+ */
+static void kgdb_tasklet_bpt(unsigned long ing)
+{
+ kgdb_breakpoint();
+ atomic_set(&kgdb_break_tasklet_var, 0);
+}
+
+static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
+
+void kgdb_schedule_breakpoint(void)
+{
+ if (atomic_read(&kgdb_break_tasklet_var) ||
+ atomic_read(&kgdb_active) != -1 ||
+ atomic_read(&kgdb_setting_breakpoint))
+ return;
+ atomic_inc(&kgdb_break_tasklet_var);
+ tasklet_schedule(&kgdb_tasklet_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
+
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
+
+/**
+ * kgdb_register_io_module - register KGDB IO module
+ * @new_dbg_io_ops: the io ops vector
+ *
+ * Register it with the KGDB core.
+ */
+int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
+{
+ int err;
+
+ spin_lock(&kgdb_registration_lock);
+
+ if (dbg_io_ops) {
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_ERR "kgdb: Another I/O driver is already "
+ "registered with KGDB.\n");
+ return -EBUSY;
+ }
+
+ if (new_dbg_io_ops->init) {
+ err = new_dbg_io_ops->init();
+ if (err) {
+ spin_unlock(&kgdb_registration_lock);
+ return err;
+ }
+ }
+
+ dbg_io_ops = new_dbg_io_ops;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
+ new_dbg_io_ops->name);
+
+ /* Arm KGDB now. */
+ kgdb_register_callbacks();
+
+ if (kgdb_break_asap)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kgdb_register_io_module);
+
+/**
+ * kkgdb_unregister_io_module - unregister KGDB IO module
+ * @old_dbg_io_ops: the io ops vector
+ *
+ * Unregister it with the KGDB core.
+ */
+void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
+{
+ BUG_ON(kgdb_connected);
+
+ /*
+ * KGDB is no longer able to communicate out, so
+ * unregister our callbacks and reset state.
+ */
+ kgdb_unregister_callbacks();
+
+ spin_lock(&kgdb_registration_lock);
+
+ WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
+ dbg_io_ops = NULL;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO
+ "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+ old_dbg_io_ops->name);
+}
+EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
+
+int dbg_io_get_char(void)
+{
+ int ret = dbg_io_ops->read_char();
+ if (ret == NO_POLL_CHAR)
+ return -1;
+ if (!dbg_kdb_mode)
+ return ret;
+ if (ret == 127)
+ return 8;
+ return ret;
+}
+
+/**
+ * kgdb_breakpoint - generate breakpoint exception
+ *
+ * This function will generate a breakpoint exception. It is used at the
+ * beginning of a program to sync up with a debugger and can be used
+ * otherwise as a quick means to stop program execution and "break" into
+ * the debugger.
+ */
+void kgdb_breakpoint(void)
+{
+ atomic_inc(&kgdb_setting_breakpoint);
+ wmb(); /* Sync point before breakpoint */
+ arch_kgdb_breakpoint();
+ wmb(); /* Sync point after breakpoint */
+ atomic_dec(&kgdb_setting_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_breakpoint);
+
+static int __init opt_kgdb_wait(char *str)
+{
+ kgdb_break_asap = 1;
+
+ kdb_init(KDB_INIT_EARLY);
+ if (kgdb_io_module_registered)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+
+early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000000..c5d753d80f67
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,81 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _DEBUG_CORE_H_
+#define _DEBUG_CORE_H_
+/*
+ * These are the private implementation headers between the kernel
+ * debugger core and the debugger front end code.
+ */
+
+/* kernel debug core data structures */
+struct kgdb_state {
+ int ex_vector;
+ int signo;
+ int err_code;
+ int cpu;
+ int pass_exception;
+ unsigned long thr_query;
+ unsigned long threadid;
+ long kgdb_usethreadid;
+ struct pt_regs *linux_regs;
+};
+
+/* Exception state values */
+#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
+#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
+#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
+#define DCPU_SSTEP 0x8 /* CPU is single stepping */
+
+struct debuggerinfo_struct {
+ void *debuggerinfo;
+ struct task_struct *task;
+ int exception_state;
+ int ret_state;
+ int irq_depth;
+};
+
+extern struct debuggerinfo_struct kgdb_info[];
+
+/* kernel debug core break point routines */
+extern int dbg_remove_all_break(void);
+extern int dbg_set_sw_break(unsigned long addr);
+extern int dbg_remove_sw_break(unsigned long addr);
+extern int dbg_activate_sw_breakpoints(void);
+extern int dbg_deactivate_sw_breakpoints(void);
+
+/* polled character access to i/o module */
+extern int dbg_io_get_char(void);
+
+/* stub return value for switching between the gdbstub and kdb */
+#define DBG_PASS_EVENT -12345
+/* Switch from one cpu to another */
+#define DBG_SWITCH_CPU_EVENT -123456
+extern int dbg_switch_cpu;
+
+/* gdbstub interface functions */
+extern int gdb_serial_stub(struct kgdb_state *ks);
+extern void gdbstub_msg_write(const char *s, int len);
+
+/* gdbstub functions used for kdb <-> gdbstub transition */
+extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
+extern int dbg_kdb_mode;
+
+#ifdef CONFIG_KGDB_KDB
+extern int kdb_stub(struct kgdb_state *ks);
+extern int kdb_parse(const char *cmdstr);
+#else /* ! CONFIG_KGDB_KDB */
+static inline int kdb_stub(struct kgdb_state *ks)
+{
+ return DBG_PASS_EVENT;
+}
+#endif /* CONFIG_KGDB_KDB */
+
+#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000000..481a7bd2dfe7
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1095 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/reboot.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/unaligned.h>
+#include "debug_core.h"
+
+#define KGDB_MAX_THREAD_QUERY 17
+
+/* Our I/O buffers. */
+static char remcom_in_buffer[BUFMAX];
+static char remcom_out_buffer[BUFMAX];
+
+/* Storage for the registers, in GDB format. */
+static unsigned long gdb_regs[(NUMREGBYTES +
+ sizeof(unsigned long) - 1) /
+ sizeof(unsigned long)];
+
+/*
+ * GDB remote protocol parser:
+ */
+
+#ifdef CONFIG_KGDB_KDB
+static int gdbstub_read_wait(void)
+{
+ int ret = -1;
+ int i;
+
+ /* poll any additional I/O interfaces that are defined */
+ while (ret < 0)
+ for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
+ ret = kdb_poll_funcs[i]();
+ if (ret > 0)
+ break;
+ }
+ return ret;
+}
+#else
+static int gdbstub_read_wait(void)
+{
+ int ret = dbg_io_ops->read_char();
+ while (ret == NO_POLL_CHAR)
+ ret = dbg_io_ops->read_char();
+ return ret;
+}
+#endif
+/* scan for the sequence $<data>#<checksum> */
+static void get_packet(char *buffer)
+{
+ unsigned char checksum;
+ unsigned char xmitcsum;
+ int count;
+ char ch;
+
+ do {
+ /*
+ * Spin and wait around for the start character, ignore all
+ * other characters:
+ */
+ while ((ch = (gdbstub_read_wait())) != '$')
+ /* nothing */;
+
+ kgdb_connected = 1;
+ checksum = 0;
+ xmitcsum = -1;
+
+ count = 0;
+
+ /*
+ * now, read until a # or end of buffer is found:
+ */
+ while (count < (BUFMAX - 1)) {
+ ch = gdbstub_read_wait();
+ if (ch == '#')
+ break;
+ checksum = checksum + ch;
+ buffer[count] = ch;
+ count = count + 1;
+ }
+ buffer[count] = 0;
+
+ if (ch == '#') {
+ xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
+ xmitcsum += hex_to_bin(gdbstub_read_wait());
+
+ if (checksum != xmitcsum)
+ /* failed checksum */
+ dbg_io_ops->write_char('-');
+ else
+ /* successful transfer */
+ dbg_io_ops->write_char('+');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ }
+ } while (checksum != xmitcsum);
+}
+
+/*
+ * Send the packet in buffer.
+ * Check for gdb connection if asked for.
+ */
+static void put_packet(char *buffer)
+{
+ unsigned char checksum;
+ int count;
+ char ch;
+
+ /*
+ * $<packet info>#<checksum>.
+ */
+ while (1) {
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+ count = 0;
+
+ while ((ch = buffer[count])) {
+ dbg_io_ops->write_char(ch);
+ checksum += ch;
+ count++;
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+
+ /* Now see what we get in reply. */
+ ch = gdbstub_read_wait();
+
+ if (ch == 3)
+ ch = gdbstub_read_wait();
+
+ /* If we get an ACK, we are done. */
+ if (ch == '+')
+ return;
+
+ /*
+ * If we get the start of another packet, this means
+ * that GDB is attempting to reconnect. We will NAK
+ * the packet being sent, and stop trying to send this
+ * packet.
+ */
+ if (ch == '$') {
+ dbg_io_ops->write_char('-');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ return;
+ }
+ }
+}
+
+static char gdbmsgbuf[BUFMAX + 1];
+
+void gdbstub_msg_write(const char *s, int len)
+{
+ char *bufptr;
+ int wcount;
+ int i;
+
+ if (len == 0)
+ len = strlen(s);
+
+ /* 'O'utput */
+ gdbmsgbuf[0] = 'O';
+
+ /* Fill and send buffers... */
+ while (len > 0) {
+ bufptr = gdbmsgbuf + 1;
+
+ /* Calculate how many this time */
+ if ((len << 1) > (BUFMAX - 2))
+ wcount = (BUFMAX - 2) >> 1;
+ else
+ wcount = len;
+
+ /* Pack in hex chars */
+ for (i = 0; i < wcount; i++)
+ bufptr = pack_hex_byte(bufptr, s[i]);
+ *bufptr = '\0';
+
+ /* Move up */
+ s += wcount;
+ len -= wcount;
+
+ /* Write packet */
+ put_packet(gdbmsgbuf);
+ }
+}
+
+/*
+ * Convert the memory pointed to by mem into hex, placing result in
+ * buf. Return a pointer to the last char put in buf (null). May
+ * return an error.
+ */
+char *kgdb_mem2hex(char *mem, char *buf, int count)
+{
+ char *tmp;
+ int err;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory copy. Hex conversion will work against this one.
+ */
+ tmp = buf + count;
+
+ err = probe_kernel_read(tmp, mem, count);
+ if (err)
+ return NULL;
+ while (count > 0) {
+ buf = pack_hex_byte(buf, *tmp);
+ tmp++;
+ count--;
+ }
+ *buf = 0;
+
+ return buf;
+}
+
+/*
+ * Convert the hex array pointed to by buf into binary to be placed in
+ * mem. Return a pointer to the character AFTER the last byte
+ * written. May return an error.
+ */
+int kgdb_hex2mem(char *buf, char *mem, int count)
+{
+ char *tmp_raw;
+ char *tmp_hex;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory that is converted from hex.
+ */
+ tmp_raw = buf + count * 2;
+
+ tmp_hex = tmp_raw - 1;
+ while (tmp_hex >= buf) {
+ tmp_raw--;
+ *tmp_raw = hex_to_bin(*tmp_hex--);
+ *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
+ }
+
+ return probe_kernel_write(mem, tmp_raw, count);
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int kgdb_hex2long(char **ptr, unsigned long *long_val)
+{
+ int hex_val;
+ int num = 0;
+ int negate = 0;
+
+ *long_val = 0;
+
+ if (**ptr == '-') {
+ negate = 1;
+ (*ptr)++;
+ }
+ while (**ptr) {
+ hex_val = hex_to_bin(**ptr);
+ if (hex_val < 0)
+ break;
+
+ *long_val = (*long_val << 4) | hex_val;
+ num++;
+ (*ptr)++;
+ }
+
+ if (negate)
+ *long_val = -*long_val;
+
+ return num;
+}
+
+/*
+ * Copy the binary array pointed to by buf into mem. Fix $, #, and
+ * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
+ * The input buf is overwitten with the result to write to mem.
+ */
+static int kgdb_ebin2mem(char *buf, char *mem, int count)
+{
+ int size = 0;
+ char *c = buf;
+
+ while (count-- > 0) {
+ c[size] = *buf++;
+ if (c[size] == 0x7d)
+ c[size] = *buf++ ^ 0x20;
+ size++;
+ }
+
+ return probe_kernel_write(mem, c, size);
+}
+
+#if DBG_MAX_REG_NUM > 0
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_get_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_set_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Write memory due to an 'M' or 'X' packet. */
+static int write_mem_msg(int binary)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long addr;
+ unsigned long length;
+ int err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
+ if (binary)
+ err = kgdb_ebin2mem(ptr, (char *)addr, length);
+ else
+ err = kgdb_hex2mem(ptr, (char *)addr, length);
+ if (err)
+ return err;
+ if (CACHE_FLUSH_IS_SAFE)
+ flush_icache_range(addr, addr + length);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void error_packet(char *pkt, int error)
+{
+ error = -error;
+ pkt[0] = 'E';
+ pkt[1] = hex_asc[(error / 10)];
+ pkt[2] = hex_asc[(error % 10)];
+ pkt[3] = '\0';
+}
+
+/*
+ * Thread ID accessors. We represent a flat TID space to GDB, where
+ * the per CPU idle threads (which under Linux all have PID 0) are
+ * remapped to negative TIDs.
+ */
+
+#define BUF_THREAD_ID_SIZE 8
+
+static char *pack_threadid(char *pkt, unsigned char *id)
+{
+ unsigned char *limit;
+ int lzero = 1;
+
+ limit = id + (BUF_THREAD_ID_SIZE / 2);
+ while (id < limit) {
+ if (!lzero || *id != 0) {
+ pkt = pack_hex_byte(pkt, *id);
+ lzero = 0;
+ }
+ id++;
+ }
+
+ if (lzero)
+ pkt = pack_hex_byte(pkt, 0);
+
+ return pkt;
+}
+
+static void int_to_threadref(unsigned char *id, int value)
+{
+ put_unaligned_be32(value, id);
+}
+
+static struct task_struct *getthread(struct pt_regs *regs, int tid)
+{
+ /*
+ * Non-positive TIDs are remapped to the cpu shadow information
+ */
+ if (tid == 0 || tid == -1)
+ tid = -atomic_read(&kgdb_active) - 2;
+ if (tid < -1 && tid > -NR_CPUS - 2) {
+ if (kgdb_info[-tid - 2].task)
+ return kgdb_info[-tid - 2].task;
+ else
+ return idle_task(-tid - 2);
+ }
+ if (tid <= 0) {
+ printk(KERN_ERR "KGDB: Internal thread select error\n");
+ dump_stack();
+ return NULL;
+ }
+
+ /*
+ * find_task_by_pid_ns() does not take the tasklist lock anymore
+ * but is nicely RCU locked - hence is a pretty resilient
+ * thing to use:
+ */
+ return find_task_by_pid_ns(tid, &init_pid_ns);
+}
+
+
+/*
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
+ */
+static inline int shadow_pid(int realpid)
+{
+ if (realpid)
+ return realpid;
+
+ return -raw_smp_processor_id() - 2;
+}
+
+/*
+ * All the functions that start with gdb_cmd are the various
+ * operations to implement the handlers for the gdbserial protocol
+ * where KGDB is communicating with an external debugger
+ */
+
+/* Handle the '?' status packets */
+static void gdb_cmd_status(struct kgdb_state *ks)
+{
+ /*
+ * We know that this packet is only sent
+ * during initial connect. So to be safe,
+ * we clear out our breakpoints now in case
+ * GDB is reconnecting.
+ */
+ dbg_remove_all_break();
+
+ remcom_out_buffer[0] = 'S';
+ pack_hex_byte(&remcom_out_buffer[1], ks->signo);
+}
+
+static void gdb_get_regs_helper(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ void *local_debuggerinfo;
+ int i;
+
+ thread = kgdb_usethread;
+ if (!thread) {
+ thread = kgdb_info[ks->cpu].task;
+ local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
+ } else {
+ local_debuggerinfo = NULL;
+ for_each_online_cpu(i) {
+ /*
+ * Try to find the task on some other
+ * or possibly this node if we do not
+ * find the matching task then we try
+ * to approximate the results.
+ */
+ if (thread == kgdb_info[i].task)
+ local_debuggerinfo = kgdb_info[i].debuggerinfo;
+ }
+ }
+
+ /*
+ * All threads that don't have debuggerinfo should be
+ * in schedule() sleeping, since all other CPUs
+ * are in kgdb_wait, and thus have debuggerinfo.
+ */
+ if (local_debuggerinfo) {
+ pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
+ } else {
+ /*
+ * Pull stuff saved during switch_to; nothing
+ * else is accessible (or even particularly
+ * relevant).
+ *
+ * This should be enough for a stack trace.
+ */
+ sleeping_thread_to_gdb_regs(gdb_regs, thread);
+ }
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+ gdb_get_regs_helper(ks);
+ kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
+}
+
+/* Handle the 'G' set registers request */
+static void gdb_cmd_setregs(struct kgdb_state *ks)
+{
+ kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
+
+ if (kgdb_usethread && kgdb_usethread != current) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+ }
+}
+
+/* Handle the 'm' memory read bytes */
+static void gdb_cmd_memread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long length;
+ unsigned long addr;
+ char *err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0) {
+ err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
+ if (!err)
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ error_packet(remcom_out_buffer, -EINVAL);
+ }
+}
+
+/* Handle the 'M' memory write bytes */
+static void gdb_cmd_memwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(0);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+#if DBG_MAX_REG_NUM > 0
+static char *gdb_hex_reg_helper(int regnum, char *out)
+{
+ int i;
+ int offset = 0;
+
+ for (i = 0; i < regnum; i++)
+ offset += dbg_reg_def[i].size;
+ return kgdb_mem2hex((char *)gdb_regs + offset, out,
+ dbg_reg_def[i].size);
+}
+
+/* Handle the 'p' individual regster get */
+static void gdb_cmd_reg_get(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (regnum >= DBG_MAX_REG_NUM) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ gdb_get_regs_helper(ks);
+ gdb_hex_reg_helper(regnum, remcom_out_buffer);
+}
+
+/* Handle the 'P' individual regster set */
+static void gdb_cmd_reg_set(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+ int i = 0;
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (*ptr++ != '=' ||
+ !(!kgdb_usethread || kgdb_usethread == current) ||
+ !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ memset(gdb_regs, 0, sizeof(gdb_regs));
+ while (i < sizeof(gdb_regs) * 2)
+ if (hex_to_bin(ptr[i]) >= 0)
+ i++;
+ else
+ break;
+ i = i / 2;
+ kgdb_hex2mem(ptr, (char *)gdb_regs, i);
+ dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Handle the 'X' memory binary write bytes */
+static void gdb_cmd_binwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(1);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+/* Handle the 'D' or 'k', detach or kill packets */
+static void gdb_cmd_detachkill(struct kgdb_state *ks)
+{
+ int error;
+
+ /* The detach case */
+ if (remcom_in_buffer[0] == 'D') {
+ error = dbg_remove_all_break();
+ if (error < 0) {
+ error_packet(remcom_out_buffer, error);
+ } else {
+ strcpy(remcom_out_buffer, "OK");
+ kgdb_connected = 0;
+ }
+ put_packet(remcom_out_buffer);
+ } else {
+ /*
+ * Assume the kill case, with no exit code checking,
+ * trying to force detach the debugger:
+ */
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ }
+}
+
+/* Handle the 'R' reboot packets */
+static int gdb_cmd_reboot(struct kgdb_state *ks)
+{
+ /* For now, only honor R0 */
+ if (strcmp(remcom_in_buffer, "R0") == 0) {
+ printk(KERN_CRIT "Executing emergency reboot\n");
+ strcpy(remcom_out_buffer, "OK");
+ put_packet(remcom_out_buffer);
+
+ /*
+ * Execution should not return from
+ * machine_emergency_restart()
+ */
+ machine_emergency_restart();
+ kgdb_connected = 0;
+
+ return 1;
+ }
+ return 0;
+}
+
+/* Handle the 'q' query packets */
+static void gdb_cmd_query(struct kgdb_state *ks)
+{
+ struct task_struct *g;
+ struct task_struct *p;
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+ int i;
+ int cpu;
+ int finished = 0;
+
+ switch (remcom_in_buffer[1]) {
+ case 's':
+ case 'f':
+ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
+ break;
+
+ i = 0;
+ remcom_out_buffer[0] = 'm';
+ ptr = remcom_out_buffer + 1;
+ if (remcom_in_buffer[1] == 'f') {
+ /* Each cpu is a shadow thread */
+ for_each_online_cpu(cpu) {
+ ks->thr_query = 0;
+ int_to_threadref(thref, -cpu - 2);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ i++;
+ }
+ }
+
+ do_each_thread(g, p) {
+ if (i >= ks->thr_query && !finished) {
+ int_to_threadref(thref, p->pid);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ ks->thr_query++;
+ if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+ finished = 1;
+ }
+ i++;
+ } while_each_thread(g, p);
+
+ *(--ptr) = '\0';
+ break;
+
+ case 'C':
+ /* Current thread id */
+ strcpy(remcom_out_buffer, "QC");
+ ks->threadid = shadow_pid(current->pid);
+ int_to_threadref(thref, ks->threadid);
+ pack_threadid(remcom_out_buffer + 2, thref);
+ break;
+ case 'T':
+ if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
+ break;
+
+ ks->threadid = 0;
+ ptr = remcom_in_buffer + 17;
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!getthread(ks->linux_regs, ks->threadid)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ if ((int)ks->threadid > 0) {
+ kgdb_mem2hex(getthread(ks->linux_regs,
+ ks->threadid)->comm,
+ remcom_out_buffer, 16);
+ } else {
+ static char tmpstr[23 + BUF_THREAD_ID_SIZE];
+
+ sprintf(tmpstr, "shadowCPU%d",
+ (int)(-ks->threadid - 2));
+ kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
+ }
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case 'R':
+ if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
+ int len = strlen(remcom_in_buffer + 6);
+
+ if ((len % 2) != 0) {
+ strcpy(remcom_out_buffer, "E01");
+ break;
+ }
+ kgdb_hex2mem(remcom_in_buffer + 6,
+ remcom_out_buffer, len);
+ len = len / 2;
+ remcom_out_buffer[len++] = 0;
+
+ kdb_parse(remcom_out_buffer);
+ strcpy(remcom_out_buffer, "OK");
+ }
+ break;
+#endif
+ }
+}
+
+/* Handle the 'H' task query packets */
+static void gdb_cmd_task(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ char *ptr;
+
+ switch (remcom_in_buffer[1]) {
+ case 'g':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_usethread = thread;
+ ks->kgdb_usethreadid = ks->threadid;
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ case 'c':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!ks->threadid) {
+ kgdb_contthread = NULL;
+ } else {
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_contthread = thread;
+ }
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ }
+}
+
+/* Handle the 'T' thread query packets */
+static void gdb_cmd_thread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ struct task_struct *thread;
+
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (thread)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, -EINVAL);
+}
+
+/* Handle the 'z' or 'Z' breakpoint remove or set packets */
+static void gdb_cmd_break(struct kgdb_state *ks)
+{
+ /*
+ * Since GDB-5.3, it's been drafted that '0' is a software
+ * breakpoint, '1' is a hardware breakpoint, so let's do that.
+ */
+ char *bpt_type = &remcom_in_buffer[1];
+ char *ptr = &remcom_in_buffer[2];
+ unsigned long addr;
+ unsigned long length;
+ int error = 0;
+
+ if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
+ /* Unsupported */
+ if (*bpt_type > '4')
+ return;
+ } else {
+ if (*bpt_type != '0' && *bpt_type != '1')
+ /* Unsupported. */
+ return;
+ }
+
+ /*
+ * Test if this is a hardware breakpoint, and
+ * if we support it:
+ */
+ if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
+ /* Unsupported. */
+ return;
+
+ if (*(ptr++) != ',') {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (!kgdb_hex2long(&ptr, &addr)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (*(ptr++) != ',' ||
+ !kgdb_hex2long(&ptr, &length)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+
+ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
+ error = dbg_set_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
+ error = dbg_remove_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'Z')
+ error = arch_kgdb_ops.set_hw_breakpoint(addr,
+ (int)length, *bpt_type - '0');
+ else if (remcom_in_buffer[0] == 'z')
+ error = arch_kgdb_ops.remove_hw_breakpoint(addr,
+ (int) length, *bpt_type - '0');
+
+ if (error == 0)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, error);
+}
+
+/* Handle the 'C' signal / exception passing packets */
+static int gdb_cmd_exception_pass(struct kgdb_state *ks)
+{
+ /* C09 == pass exception
+ * C15 == detach kgdb, pass exception
+ */
+ if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'c';
+
+ } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'D';
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ return 1;
+
+ } else {
+ gdbstub_msg_write("KGDB only knows signal 9 (pass)"
+ " and 15 (pass and disconnect)\n"
+ "Executing a continue without signal passing\n", 0);
+ remcom_in_buffer[0] = 'c';
+ }
+
+ /* Indicate fall through */
+ return -1;
+}
+
+/*
+ * This function performs all gdbserial command procesing
+ */
+int gdb_serial_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ int tmp;
+
+ /* Initialize comm buffer and globals. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+ kgdb_usethread = kgdb_info[ks->cpu].task;
+ ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+ ks->pass_exception = 0;
+
+ if (kgdb_connected) {
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+
+ /* Reply to host that an exception has occurred */
+ ptr = remcom_out_buffer;
+ *ptr++ = 'T';
+ ptr = pack_hex_byte(ptr, ks->signo);
+ ptr += strlen(strcpy(ptr, "thread:"));
+ int_to_threadref(thref, shadow_pid(current->pid));
+ ptr = pack_threadid(ptr, thref);
+ *ptr++ = ';';
+ put_packet(remcom_out_buffer);
+ }
+
+ while (1) {
+ error = 0;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ get_packet(remcom_in_buffer);
+
+ switch (remcom_in_buffer[0]) {
+ case '?': /* gdbserial status */
+ gdb_cmd_status(ks);
+ break;
+ case 'g': /* return the value of the CPU registers */
+ gdb_cmd_getregs(ks);
+ break;
+ case 'G': /* set the value of the CPU registers - return OK */
+ gdb_cmd_setregs(ks);
+ break;
+ case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
+ gdb_cmd_memread(ks);
+ break;
+ case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_memwrite(ks);
+ break;
+#if DBG_MAX_REG_NUM > 0
+ case 'p': /* pXX Return gdb register XX (in hex) */
+ gdb_cmd_reg_get(ks);
+ break;
+ case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
+ gdb_cmd_reg_set(ks);
+ break;
+#endif /* DBG_MAX_REG_NUM > 0 */
+ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_binwrite(ks);
+ break;
+ /* kill or detach. KGDB should treat this like a
+ * continue.
+ */
+ case 'D': /* Debugger detach */
+ case 'k': /* Debugger detach via kill */
+ gdb_cmd_detachkill(ks);
+ goto default_handle;
+ case 'R': /* Reboot */
+ if (gdb_cmd_reboot(ks))
+ goto default_handle;
+ break;
+ case 'q': /* query command */
+ gdb_cmd_query(ks);
+ break;
+ case 'H': /* task related */
+ gdb_cmd_task(ks);
+ break;
+ case 'T': /* Query thread status */
+ gdb_cmd_thread(ks);
+ break;
+ case 'z': /* Break point remove */
+ case 'Z': /* Break point set */
+ gdb_cmd_break(ks);
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case '3': /* Escape into back into kdb */
+ if (remcom_in_buffer[1] == '\0') {
+ gdb_cmd_detachkill(ks);
+ return DBG_PASS_EVENT;
+ }
+#endif
+ case 'C': /* Exception passing */
+ tmp = gdb_cmd_exception_pass(ks);
+ if (tmp > 0)
+ goto default_handle;
+ if (tmp == 0)
+ break;
+ /* Fall through on tmp < 0 */
+ case 'c': /* Continue packet */
+ case 's': /* Single step packet */
+ if (kgdb_contthread && kgdb_contthread != current) {
+ /* Can't switch threads in kgdb */
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ dbg_activate_sw_breakpoints();
+ /* Fall through to default processing */
+ default:
+default_handle:
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ /*
+ * Leave cmd processing on error, detach,
+ * kill, continue, or single step.
+ */
+ if (error >= 0 || remcom_in_buffer[0] == 'D' ||
+ remcom_in_buffer[0] == 'k') {
+ error = 0;
+ goto kgdb_exit;
+ }
+
+ }
+
+ /* reply to the request */
+ put_packet(remcom_out_buffer);
+ }
+
+kgdb_exit:
+ if (ks->pass_exception)
+ error = 1;
+ return error;
+}
+
+int gdbstub_state(struct kgdb_state *ks, char *cmd)
+{
+ int error;
+
+ switch (cmd[0]) {
+ case 'e':
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ return error;
+ case 's':
+ case 'c':
+ strcpy(remcom_in_buffer, cmd);
+ return 0;
+ case '?':
+ gdb_cmd_status(ks);
+ break;
+ case '\0':
+ strcpy(remcom_out_buffer, "");
+ break;
+ }
+ dbg_io_ops->write_char('+');
+ put_packet(remcom_out_buffer);
+ return 0;
+}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000000..396d12eda9e8
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
+gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000000..d4fc58f4b88d
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+#
+
+CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
+obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
+obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
+
+clean-files := gen-kdb_cmds.c
+
+quiet_cmd_gen-kdb = GENKDB $@
+ cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
+ /^\#/{next} \
+ /^[ \t]*$$/{next} \
+ {gsub(/"/, "\\\"", $$0); \
+ print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
+ END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
+ $(filter-out %/Makefile,$^) > $@#
+
+$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
+ $(call cmd,gen-kdb)
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000000..75bd9b3ebbb7
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,564 @@
+/*
+ * Kernel Debugger Architecture Independent Breakpoint Handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdb.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include "kdb_private.h"
+
+/*
+ * Table of kdb_breakpoints
+ */
+kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
+
+static void kdb_setsinglestep(struct pt_regs *regs)
+{
+ KDB_STATE_SET(DOING_SS);
+}
+
+static char *kdb_rwtypes[] = {
+ "Instruction(i)",
+ "Instruction(Register)",
+ "Data Write",
+ "I/O",
+ "Data Access"
+};
+
+static char *kdb_bptype(kdb_bp_t *bp)
+{
+ if (bp->bp_type < 0 || bp->bp_type > 4)
+ return "";
+
+ return kdb_rwtypes[bp->bp_type];
+}
+
+static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
+{
+ int nextarg = *nextargp;
+ int diag;
+
+ bp->bph_length = 1;
+ if ((argc + 1) != nextarg) {
+ if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
+ bp->bp_type = BP_ACCESS_WATCHPOINT;
+ else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
+ bp->bp_type = BP_WRITE_WATCHPOINT;
+ else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
+ bp->bp_type = BP_HARDWARE_BREAKPOINT;
+ else
+ return KDB_ARGCOUNT;
+
+ bp->bph_length = 1;
+
+ nextarg++;
+
+ if ((argc + 1) != nextarg) {
+ unsigned long len;
+
+ diag = kdbgetularg((char *)argv[nextarg],
+ &len);
+ if (diag)
+ return diag;
+
+
+ if (len > 8)
+ return KDB_BADLENGTH;
+
+ bp->bph_length = len;
+ nextarg++;
+ }
+
+ if ((argc + 1) != nextarg)
+ return KDB_ARGCOUNT;
+ }
+
+ *nextargp = nextarg;
+ return 0;
+}
+
+static int _kdb_bp_remove(kdb_bp_t *bp)
+{
+ int ret = 1;
+ if (!bp->bp_installed)
+ return ret;
+ if (!bp->bp_type)
+ ret = dbg_remove_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0)
+ bp->bp_installed = 0;
+ return ret;
+}
+
+static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ if (KDB_DEBUG(BP))
+ kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
+
+ /*
+ * Setup single step
+ */
+ kdb_setsinglestep(regs);
+
+ /*
+ * Reset delay attribute
+ */
+ bp->bp_delay = 0;
+ bp->bp_delayed = 1;
+}
+
+static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ int ret;
+ /*
+ * Install the breakpoint, if it is not already installed.
+ */
+
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: bp_installed %d\n",
+ __func__, bp->bp_installed);
+ if (!KDB_STATE(SSBPT))
+ bp->bp_delay = 0;
+ if (bp->bp_installed)
+ return 1;
+ if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: delayed bp\n", __func__);
+ kdb_handle_bp(regs, bp);
+ return 0;
+ }
+ if (!bp->bp_type)
+ ret = dbg_set_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0) {
+ bp->bp_installed = 1;
+ } else {
+ kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
+ __func__, bp->bp_addr);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_bp_install
+ *
+ * Install kdb_breakpoints prior to returning from the
+ * kernel debugger. This allows the kdb_breakpoints to be set
+ * upon functions that are used internally by kdb, such as
+ * printk(). This function is only called once per kdb session.
+ */
+void kdb_bp_install(struct pt_regs *regs)
+{
+ int i;
+
+ for (i = 0; i < KDB_MAXBPT; i++) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_install(regs, bp);
+ }
+}
+
+/*
+ * kdb_bp_remove
+ *
+ * Remove kdb_breakpoints upon entry to the kernel debugger.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+void kdb_bp_remove(void)
+{
+ int i;
+
+ for (i = KDB_MAXBPT - 1; i >= 0; i--) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_remove(bp);
+ }
+}
+
+
+/*
+ * kdb_printbp
+ *
+ * Internal function to format and print a breakpoint entry.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+static void kdb_printbp(kdb_bp_t *bp, int i)
+{
+ kdb_printf("%s ", kdb_bptype(bp));
+ kdb_printf("BP #%d at ", i);
+ kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
+
+ if (bp->bp_enabled)
+ kdb_printf("\n is enabled");
+ else
+ kdb_printf("\n is disabled");
+
+ kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+ bp->bp_addr, bp->bp_type, bp->bp_installed);
+
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_bp
+ *
+ * Handle the bp commands.
+ *
+ * [bp|bph] <addr-expression> [DATAR|DATAW]
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * bp Set breakpoint on all cpus. Only use hardware assist if need.
+ * bph Set breakpoint on all cpus. Force hardware register
+ */
+
+static int kdb_bp(int argc, const char **argv)
+{
+ int i, bpno;
+ kdb_bp_t *bp, *bp_check;
+ int diag;
+ int free;
+ char *symname = NULL;
+ long offset = 0ul;
+ int nextarg;
+ kdb_bp_t template = {0};
+
+ if (argc == 0) {
+ /*
+ * Display breakpoint table
+ */
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
+ bpno++, bp++) {
+ if (bp->bp_free)
+ continue;
+ kdb_printbp(bp, bpno);
+ }
+
+ return 0;
+ }
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
+ &offset, &symname);
+ if (diag)
+ return diag;
+ if (!template.bp_addr)
+ return KDB_BADINT;
+
+ /*
+ * Find an empty bp structure to allocate
+ */
+ free = KDB_MAXBPT;
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
+ if (bp->bp_free)
+ break;
+ }
+
+ if (bpno == KDB_MAXBPT)
+ return KDB_TOOMANYBPT;
+
+ if (strcmp(argv[0], "bph") == 0) {
+ template.bp_type = BP_HARDWARE_BREAKPOINT;
+ diag = kdb_parsebp(argc, argv, &nextarg, &template);
+ if (diag)
+ return diag;
+ } else {
+ template.bp_type = BP_BREAKPOINT;
+ }
+
+ /*
+ * Check for clashing breakpoints.
+ *
+ * Note, in this design we can't have hardware breakpoints
+ * enabled for both read and write on the same address.
+ */
+ for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp_check++) {
+ if (!bp_check->bp_free &&
+ bp_check->bp_addr == template.bp_addr) {
+ kdb_printf("You already have a breakpoint at "
+ kdb_bfd_vma_fmt0 "\n", template.bp_addr);
+ return KDB_DUPBPT;
+ }
+ }
+
+ template.bp_enabled = 1;
+
+ /*
+ * Actually allocate the breakpoint found earlier
+ */
+ *bp = template;
+ bp->bp_free = 0;
+
+ kdb_printbp(bp, bpno);
+
+ return 0;
+}
+
+/*
+ * kdb_bc
+ *
+ * Handles the 'bc', 'be', and 'bd' commands
+ *
+ * [bd|bc|be] <breakpoint-number>
+ * [bd|bc|be] *
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic for failure
+ * Locking:
+ * None.
+ * Remarks:
+ */
+static int kdb_bc(int argc, const char **argv)
+{
+ unsigned long addr;
+ kdb_bp_t *bp = NULL;
+ int lowbp = KDB_MAXBPT;
+ int highbp = 0;
+ int done = 0;
+ int i;
+ int diag = 0;
+
+ int cmd; /* KDBCMD_B? */
+#define KDBCMD_BC 0
+#define KDBCMD_BE 1
+#define KDBCMD_BD 2
+
+ if (strcmp(argv[0], "be") == 0)
+ cmd = KDBCMD_BE;
+ else if (strcmp(argv[0], "bd") == 0)
+ cmd = KDBCMD_BD;
+ else
+ cmd = KDBCMD_BC;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ if (strcmp(argv[1], "*") == 0) {
+ lowbp = 0;
+ highbp = KDB_MAXBPT;
+ } else {
+ diag = kdbgetularg(argv[1], &addr);
+ if (diag)
+ return diag;
+
+ /*
+ * For addresses less than the maximum breakpoint number,
+ * assume that the breakpoint number is desired.
+ */
+ if (addr < KDB_MAXBPT) {
+ bp = &kdb_breakpoints[addr];
+ lowbp = highbp = addr;
+ highbp++;
+ } else {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp++) {
+ if (bp->bp_addr == addr) {
+ lowbp = highbp = i;
+ highbp++;
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Now operate on the set of breakpoints matching the input
+ * criteria (either '*' for all, or an individual breakpoint).
+ */
+ for (bp = &kdb_breakpoints[lowbp], i = lowbp;
+ i < highbp;
+ i++, bp++) {
+ if (bp->bp_free)
+ continue;
+
+ done++;
+
+ switch (cmd) {
+ case KDBCMD_BC:
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " cleared\n",
+ i, bp->bp_addr);
+
+ bp->bp_addr = 0;
+ bp->bp_free = 1;
+
+ break;
+ case KDBCMD_BE:
+ bp->bp_enabled = 1;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " enabled",
+ i, bp->bp_addr);
+
+ kdb_printf("\n");
+ break;
+ case KDBCMD_BD:
+ if (!bp->bp_enabled)
+ break;
+
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " disabled\n",
+ i, bp->bp_addr);
+
+ break;
+ }
+ if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
+ bp->bp_delay = 0;
+ KDB_STATE_CLEAR(SSBPT);
+ }
+ }
+
+ return (!done) ? KDB_BPTNOTFOUND : 0;
+}
+
+/*
+ * kdb_ss
+ *
+ * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
+ * commands.
+ *
+ * ss
+ * ssb
+ *
+ * Parameters:
+ * argc Argument count
+ * argv Argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * KDB_CMD_SS[B] for success, a kdb error if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * Set the arch specific option to trigger a debug trap after the next
+ * instruction.
+ *
+ * For 'ssb', set the trace flag in the debug trap handler
+ * after printing the current insn and return directly without
+ * invoking the kdb command processor, until a branch instruction
+ * is encountered.
+ */
+
+static int kdb_ss(int argc, const char **argv)
+{
+ int ssb = 0;
+
+ ssb = (strcmp(argv[0], "ssb") == 0);
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ /*
+ * Set trace flag and go.
+ */
+ KDB_STATE_SET(DOING_SS);
+ if (ssb) {
+ KDB_STATE_SET(DOING_SSB);
+ return KDB_CMD_SSB;
+ }
+ return KDB_CMD_SS;
+}
+
+/* Initialize the breakpoint table and register breakpoint commands. */
+
+void __init kdb_initbptab(void)
+{
+ int i;
+ kdb_bp_t *bp;
+
+ /*
+ * First time initialization.
+ */
+ memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
+ bp->bp_free = 1;
+
+ kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
+ "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
+ "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
+ kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
+ "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("bc", kdb_bc, "<bpnum>",
+ "Clear Breakpoint", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("be", kdb_bc, "<bpnum>",
+ "Enable Breakpoint", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bd", kdb_bc, "<bpnum>",
+ "Disable Breakpoint", 0, KDB_REPEAT_NONE);
+
+ kdb_register_repeat("ss", kdb_ss, "",
+ "Single Step", 1, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("ssb", kdb_ss, "",
+ "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
+ /*
+ * Architecture dependent initialization.
+ */
+}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000000..2f62fe85f16a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
+/*
+ * Kernel Debugger Architecture Independent Stack Traceback
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/kdb.h>
+#include <linux/nmi.h>
+#include <asm/system.h>
+#include "kdb_private.h"
+
+
+static void kdb_show_stack(struct task_struct *p, void *addr)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = 15;
+ kdb_trap_printk++;
+ kdb_set_current_task(p);
+ if (addr) {
+ show_stack((struct task_struct *)p, addr);
+ } else if (kdb_current_regs) {
+#ifdef CONFIG_X86
+ show_stack(p, &kdb_current_regs->sp);
+#else
+ show_stack(p, NULL);
+#endif
+ } else {
+ show_stack(p, NULL);
+ }
+ console_loglevel = old_lvl;
+ kdb_trap_printk--;
+}
+
+/*
+ * kdb_bt
+ *
+ * This function implements the 'bt' command. Print a stack
+ * traceback.
+ *
+ * bt [<address-expression>] (addr-exp is for alternate stacks)
+ * btp <pid> Kernel stack for <pid>
+ * btt <address-expression> Kernel stack for task structure at
+ * <address-expression>
+ * bta [DRSTCZEUIMA] All useful processes, optionally
+ * filtered by state
+ * btc [<cpu>] The current process on one cpu,
+ * default is all cpus
+ *
+ * bt <address-expression> refers to a address on the stack, that location
+ * is assumed to contain a return address.
+ *
+ * btt <address-expression> refers to the address of a struct task.
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * Backtrack works best when the code uses frame pointers. But even
+ * without frame pointers we should get a reasonable trace.
+ *
+ * mds comes in handy when examining the stack to do a manual traceback or
+ * to get a starting point for bt <address-expression>.
+ */
+
+static int
+kdb_bt1(struct task_struct *p, unsigned long mask,
+ int argcount, int btaprompt)
+{
+ char buffer[2];
+ if (kdb_getarea(buffer[0], (unsigned long)p) ||
+ kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
+ return KDB_BADADDR;
+ if (!kdb_task_state(p, mask))
+ return 0;
+ kdb_printf("Stack traceback for pid %d\n", p->pid);
+ kdb_ps1(p);
+ kdb_show_stack(p, NULL);
+ if (btaprompt) {
+ kdb_getstr(buffer, sizeof(buffer),
+ "Enter <q> to end, <cr> to continue:");
+ if (buffer[0] == 'q') {
+ kdb_printf("\n");
+ return 1;
+ }
+ }
+ touch_nmi_watchdog();
+ return 0;
+}
+
+int
+kdb_bt(int argc, const char **argv)
+{
+ int diag;
+ int argcount = 5;
+ int btaprompt = 1;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+
+ kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
+ kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
+ * proc in bta */
+
+ if (strcmp(argv[0], "bta") == 0) {
+ struct task_struct *g, *p;
+ unsigned long cpu;
+ unsigned long mask = kdb_task_state_string(argc ? argv[1] :
+ NULL);
+ if (argc == 0)
+ kdb_ps_suppressed();
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ }
+ /* Now the inactive tasks */
+ kdb_do_each_thread(g, p) {
+ if (task_curr(p))
+ continue;
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ } kdb_while_each_thread(g, p);
+ } else if (strcmp(argv[0], "btp") == 0) {
+ struct task_struct *p;
+ unsigned long pid;
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &pid);
+ if (diag)
+ return diag;
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p) {
+ kdb_set_current_task(p);
+ return kdb_bt1(p, ~0UL, argcount, 0);
+ }
+ kdb_printf("No process with pid == %ld found\n", pid);
+ return 0;
+ } else if (strcmp(argv[0], "btt") == 0) {
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &addr);
+ if (diag)
+ return diag;
+ kdb_set_current_task((struct task_struct *)addr);
+ return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
+ } else if (strcmp(argv[0], "btc") == 0) {
+ unsigned long cpu = ~0;
+ struct task_struct *save_current_task = kdb_current_task;
+ char buf[80];
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+ if (argc == 1) {
+ diag = kdbgetularg((char *)argv[1], &cpu);
+ if (diag)
+ return diag;
+ }
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ if (cpu != ~0) {
+ if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
+ kdb_printf("no process for cpu %ld\n", cpu);
+ return 0;
+ }
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ return 0;
+ }
+ kdb_printf("btc: cpu status: ");
+ kdb_parse("cpu\n");
+ for_each_online_cpu(cpu) {
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ touch_nmi_watchdog();
+ }
+ kdb_set_current_task(save_current_task);
+ return 0;
+ } else {
+ if (argc) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ kdb_show_stack(kdb_current_task, (void *)addr);
+ return 0;
+ } else {
+ return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
+ }
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000000..56c88e4db309
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,35 @@
+# Initial commands for kdb, alter to suit your needs.
+# These commands are executed in kdb_init() context, no SMP, no
+# processes. Commands that require process data (including stack or
+# registers) are not reliable this early. set and bp commands should
+# be safe. Global breakpoint commands affect each cpu as it is booted.
+
+# Standard debugging information for first level support, just type archkdb
+# or archkdbcpu or archkdbshort at the kdb prompt.
+
+defcmd dumpcommon "" "Common kdb debugging"
+ set BTAPROMPT 0
+ set LINES 10000
+ -summary
+ -cpu
+ -ps
+ -dmesg 600
+ -bt
+endefcmd
+
+defcmd dumpall "" "First line debugging"
+ set BTSYMARG 1
+ set BTARGS 9
+ pid R
+ -dumpcommon
+ -bta
+endefcmd
+
+defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
+ set BTSYMARG 1
+ set BTARGS 9
+ pid R
+ -dumpcommon
+ -btc
+endefcmd
+
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000000..bf6e8270e957
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,169 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kdebug.h>
+#include "kdb_private.h"
+#include "../debug_core.h"
+
+/*
+ * KDB interface to KGDB internals
+ */
+get_char_func kdb_poll_funcs[] = {
+ dbg_io_get_char,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+};
+EXPORT_SYMBOL_GPL(kdb_poll_funcs);
+
+int kdb_poll_idx = 1;
+EXPORT_SYMBOL_GPL(kdb_poll_idx);
+
+int kdb_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ kdb_bp_t *bp;
+ unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ kdb_reason_t reason = KDB_REASON_OOPS;
+ kdb_dbtrap_t db_result = KDB_DB_NOBPT;
+ int i;
+
+ if (KDB_STATE(REENTRY)) {
+ reason = KDB_REASON_SWITCH;
+ KDB_STATE_CLEAR(REENTRY);
+ addr = instruction_pointer(ks->linux_regs);
+ }
+ ks->pass_exception = 0;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ reason = KDB_REASON_KEYBOARD;
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ if (addr != instruction_pointer(ks->linux_regs))
+ kgdb_arch_set_pc(ks->linux_regs, addr);
+ break;
+ }
+ }
+ if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if (bp->bp_free)
+ continue;
+ if (bp->bp_addr == addr) {
+ bp->bp_delay = 1;
+ bp->bp_delayed = 1;
+ /*
+ * SSBPT is set when the kernel debugger must single step a
+ * task in order to re-establish an instruction breakpoint
+ * which uses the instruction replacement mechanism. It is
+ * cleared by any action that removes the need to single-step
+ * the breakpoint.
+ */
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ KDB_STATE_SET(SSBPT);
+ break;
+ }
+ }
+ }
+
+ if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
+ ks->signo == SIGTRAP) {
+ reason = KDB_REASON_SSTEP;
+ db_result = KDB_DB_BPT;
+ }
+ /* Set initial kdb state variables */
+ KDB_STATE_CLEAR(KGDB_TRANS);
+ kdb_initial_cpu = ks->cpu;
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ /* Remove any breakpoints as needed by kdb and clear single step */
+ kdb_bp_remove();
+ KDB_STATE_CLEAR(DOING_SS);
+ KDB_STATE_CLEAR(DOING_SSB);
+ KDB_STATE_SET(PAGER);
+ /* zero out any offline cpu data */
+ for_each_present_cpu(i) {
+ if (!cpu_online(i)) {
+ kgdb_info[i].debuggerinfo = NULL;
+ kgdb_info[i].task = NULL;
+ }
+ }
+ if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
+ ks->pass_exception = 1;
+ KDB_FLAG_SET(CATASTROPHIC);
+ }
+ kdb_initial_cpu = ks->cpu;
+ if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
+ KDB_STATE_CLEAR(SSBPT);
+ KDB_STATE_CLEAR(DOING_SS);
+ } else {
+ /* Start kdb main loop */
+ error = kdb_main_loop(KDB_REASON_ENTER, reason,
+ ks->err_code, db_result, ks->linux_regs);
+ }
+ /*
+ * Upon exit from the kdb main loop setup break points and restart
+ * the system based on the requested continue state
+ */
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ KDB_STATE_CLEAR(PAGER);
+ kdbnearsym_cleanup();
+ if (error == KDB_CMD_KGDB) {
+ if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
+ /*
+ * This inteface glue which allows kdb to transition in into
+ * the gdb stub. In order to do this the '?' or '' gdb serial
+ * packet response is processed here. And then control is
+ * passed to the gdbstub.
+ */
+ if (KDB_STATE(DOING_KGDB))
+ gdbstub_state(ks, "?");
+ else
+ gdbstub_state(ks, "");
+ KDB_STATE_CLEAR(DOING_KGDB);
+ KDB_STATE_CLEAR(DOING_KGDB2);
+ }
+ return DBG_PASS_EVENT;
+ }
+ kdb_bp_install(ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ /* Set the exit state to a single step or a continue */
+ if (KDB_STATE(DOING_SS))
+ gdbstub_state(ks, "s");
+ else
+ gdbstub_state(ks, "c");
+
+ KDB_FLAG_CLEAR(CATASTROPHIC);
+
+ /* Invoke arch specific exception handling prior to system resume */
+ kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
+ if (ks->pass_exception)
+ kgdb_info[ks->cpu].ret_state = 1;
+ if (error == KDB_CMD_CPU) {
+ KDB_STATE_SET(REENTRY);
+ /*
+ * Force clear the single step bit because kdb emulates this
+ * differently vs the gdbstub
+ */
+ kgdb_single_step = 0;
+ dbg_deactivate_sw_breakpoints();
+ return DBG_SWITCH_CPU_EVENT;
+ }
+ return kgdb_info[ks->cpu].ret_state;
+}
+
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000000..c9b7f4f90bba
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
+/*
+ * Kernel Debugger Architecture Independent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/console.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kallsyms.h>
+#include "kdb_private.h"
+
+#define CMD_BUFLEN 256
+char kdb_prompt_str[CMD_BUFLEN];
+
+int kdb_trap_printk;
+
+static void kgdb_transition_check(char *buffer)
+{
+ int slen = strlen(buffer);
+ if (strncmp(buffer, "$?#3f", slen) != 0 &&
+ strncmp(buffer, "$qSupported#37", slen) != 0 &&
+ strncmp(buffer, "+$qSupported#37", slen) != 0) {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ }
+}
+
+static int kdb_read_get_key(char *buffer, size_t bufsize)
+{
+#define ESCAPE_UDELAY 1000
+#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
+ char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
+ char *ped = escape_data;
+ int escape_delay = 0;
+ get_char_func *f, *f_escape = NULL;
+ int key;
+
+ for (f = &kdb_poll_funcs[0]; ; ++f) {
+ if (*f == NULL) {
+ /* Reset NMI watchdog once per poll loop */
+ touch_nmi_watchdog();
+ f = &kdb_poll_funcs[0];
+ }
+ if (escape_delay == 2) {
+ *ped = '\0';
+ ped = escape_data;
+ --escape_delay;
+ }
+ if (escape_delay == 1) {
+ key = *ped++;
+ if (!*ped)
+ --escape_delay;
+ break;
+ }
+ key = (*f)();
+ if (key == -1) {
+ if (escape_delay) {
+ udelay(ESCAPE_UDELAY);
+ --escape_delay;
+ }
+ continue;
+ }
+ if (bufsize <= 2) {
+ if (key == '\r')
+ key = '\n';
+ *buffer++ = key;
+ *buffer = '\0';
+ return -1;
+ }
+ if (escape_delay == 0 && key == '\e') {
+ escape_delay = ESCAPE_DELAY;
+ ped = escape_data;
+ f_escape = f;
+ }
+ if (escape_delay) {
+ *ped++ = key;
+ if (f_escape != f) {
+ escape_delay = 2;
+ continue;
+ }
+ if (ped - escape_data == 1) {
+ /* \e */
+ continue;
+ } else if (ped - escape_data == 2) {
+ /* \e<something> */
+ if (key != '[')
+ escape_delay = 2;
+ continue;
+ } else if (ped - escape_data == 3) {
+ /* \e[<something> */
+ int mapkey = 0;
+ switch (key) {
+ case 'A': /* \e[A, up arrow */
+ mapkey = 16;
+ break;
+ case 'B': /* \e[B, down arrow */
+ mapkey = 14;
+ break;
+ case 'C': /* \e[C, right arrow */
+ mapkey = 6;
+ break;
+ case 'D': /* \e[D, left arrow */
+ mapkey = 2;
+ break;
+ case '1': /* dropthrough */
+ case '3': /* dropthrough */
+ /* \e[<1,3,4>], may be home, del, end */
+ case '4':
+ mapkey = -1;
+ break;
+ }
+ if (mapkey != -1) {
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ }
+ continue;
+ } else if (ped - escape_data == 4) {
+ /* \e[<1,3,4><something> */
+ int mapkey = 0;
+ if (key == '~') {
+ switch (escape_data[2]) {
+ case '1': /* \e[1~, home */
+ mapkey = 1;
+ break;
+ case '3': /* \e[3~, del */
+ mapkey = 4;
+ break;
+ case '4': /* \e[4~, end */
+ mapkey = 5;
+ break;
+ }
+ }
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ continue;
+ }
+ }
+ break; /* A key to process */
+ }
+ return key;
+}
+
+/*
+ * kdb_read
+ *
+ * This function reads a string of characters, terminated by
+ * a newline, or by reaching the end of the supplied buffer,
+ * from the current kernel debugger console device.
+ * Parameters:
+ * buffer - Address of character buffer to receive input characters.
+ * bufsize - size, in bytes, of the character buffer
+ * Returns:
+ * Returns a pointer to the buffer containing the received
+ * character string. This string will be terminated by a
+ * newline character.
+ * Locking:
+ * No locks are required to be held upon entry to this
+ * function. It is not reentrant - it relies on the fact
+ * that while kdb is running on only one "master debug" cpu.
+ * Remarks:
+ *
+ * The buffer size must be >= 2. A buffer size of 2 means that the caller only
+ * wants a single key.
+ *
+ * An escape key could be the start of a vt100 control sequence such as \e[D
+ * (left arrow) or it could be a character in its own right. The standard
+ * method for detecting the difference is to wait for 2 seconds to see if there
+ * are any other characters. kdb is complicated by the lack of a timer service
+ * (interrupts are off), by multiple input sources and by the need to sometimes
+ * return after just one key. Escape sequence processing has to be done as
+ * states in the polling loop.
+ */
+
+static char *kdb_read(char *buffer, size_t bufsize)
+{
+ char *cp = buffer;
+ char *bufend = buffer+bufsize-2; /* Reserve space for newline
+ * and null byte */
+ char *lastchar;
+ char *p_tmp;
+ char tmp;
+ static char tmpbuffer[CMD_BUFLEN];
+ int len = strlen(buffer);
+ int len_tmp;
+ int tab = 0;
+ int count;
+ int i;
+ int diag, dtab_count;
+ int key;
+
+
+ diag = kdbgetintenv("DTABCOUNT", &dtab_count);
+ if (diag)
+ dtab_count = 30;
+
+ if (len > 0) {
+ cp += len;
+ if (*(buffer+len-1) == '\n')
+ cp--;
+ }
+
+ lastchar = cp;
+ *cp = '\0';
+ kdb_printf("%s", buffer);
+poll_again:
+ key = kdb_read_get_key(buffer, bufsize);
+ if (key == -1)
+ return buffer;
+ if (key != 9)
+ tab = 0;
+ switch (key) {
+ case 8: /* backspace */
+ if (cp > buffer) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp-1, tmpbuffer, lastchar - cp);
+ }
+ *(--lastchar) = '\0';
+ --cp;
+ kdb_printf("\b%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 13: /* enter */
+ *lastchar++ = '\n';
+ *lastchar++ = '\0';
+ kdb_printf("\n");
+ return buffer;
+ case 4: /* Del */
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
+ memcpy(cp, tmpbuffer, lastchar - cp - 1);
+ *(--lastchar) = '\0';
+ kdb_printf("%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 1: /* Home */
+ if (cp > buffer) {
+ kdb_printf("\r");
+ kdb_printf(kdb_prompt_str);
+ cp = buffer;
+ }
+ break;
+ case 5: /* End */
+ if (cp < lastchar) {
+ kdb_printf("%s", cp);
+ cp = lastchar;
+ }
+ break;
+ case 2: /* Left */
+ if (cp > buffer) {
+ kdb_printf("\b");
+ --cp;
+ }
+ break;
+ case 14: /* Down */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 6: /* Right */
+ if (cp < lastchar) {
+ kdb_printf("%c", *cp);
+ ++cp;
+ }
+ break;
+ case 16: /* Up */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 9: /* Tab */
+ if (tab < 2)
+ ++tab;
+ p_tmp = buffer;
+ while (*p_tmp == ' ')
+ p_tmp++;
+ if (p_tmp > cp)
+ break;
+ memcpy(tmpbuffer, p_tmp, cp-p_tmp);
+ *(tmpbuffer + (cp-p_tmp)) = '\0';
+ p_tmp = strrchr(tmpbuffer, ' ');
+ if (p_tmp)
+ ++p_tmp;
+ else
+ p_tmp = tmpbuffer;
+ len = strlen(p_tmp);
+ count = kallsyms_symbol_complete(p_tmp,
+ sizeof(tmpbuffer) -
+ (p_tmp - tmpbuffer));
+ if (tab == 2 && count > 0) {
+ kdb_printf("\n%d symbols are found.", count);
+ if (count > dtab_count) {
+ count = dtab_count;
+ kdb_printf(" But only first %d symbols will"
+ " be printed.\nYou can change the"
+ " environment variable DTABCOUNT.",
+ count);
+ }
+ kdb_printf("\n");
+ for (i = 0; i < count; i++) {
+ if (kallsyms_symbol_next(p_tmp, i) < 0)
+ break;
+ kdb_printf("%s ", p_tmp);
+ *(p_tmp + len) = '\0';
+ }
+ if (i >= dtab_count)
+ kdb_printf("...");
+ kdb_printf("\n");
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ } else if (tab != 2 && count > 0) {
+ len_tmp = strlen(p_tmp);
+ strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
+ len_tmp = strlen(p_tmp);
+ strncpy(cp, p_tmp+len, len_tmp-len + 1);
+ len = len_tmp - len;
+ kdb_printf("%s", cp);
+ cp += len;
+ lastchar += len;
+ }
+ kdb_nextline = 1; /* reset output line number */
+ break;
+ default:
+ if (key >= 32 && lastchar < bufend) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp+1, tmpbuffer, lastchar - cp);
+ *++lastchar = '\0';
+ *cp = key;
+ kdb_printf("%s\r", cp);
+ ++cp;
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ } else {
+ *++lastchar = '\0';
+ *cp++ = key;
+ /* The kgdb transition check will hide
+ * printed characters if we think that
+ * kgdb is connecting, until the check
+ * fails */
+ if (!KDB_STATE(KGDB_TRANS))
+ kgdb_transition_check(buffer);
+ else
+ kdb_printf("%c", key);
+ }
+ /* Special escape to kgdb */
+ if (lastchar - buffer >= 5 &&
+ strcmp(lastchar - 5, "$?#3f") == 0) {
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return buffer;
+ }
+ if (lastchar - buffer >= 14 &&
+ strcmp(lastchar - 14, "$qSupported#37") == 0) {
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB2);
+ return buffer;
+ }
+ }
+ break;
+ }
+ goto poll_again;
+}
+
+/*
+ * kdb_getstr
+ *
+ * Print the prompt string and read a command from the
+ * input device.
+ *
+ * Parameters:
+ * buffer Address of buffer to receive command
+ * bufsize Size of buffer in bytes
+ * prompt Pointer to string to use as prompt string
+ * Returns:
+ * Pointer to command buffer.
+ * Locking:
+ * None.
+ * Remarks:
+ * For SMP kernels, the processor number will be
+ * substituted for %d, %x or %o in the prompt.
+ */
+
+char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+{
+ if (prompt && kdb_prompt_str != prompt)
+ strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
+ kdb_printf(kdb_prompt_str);
+ kdb_nextline = 1; /* Prompt and input resets line number */
+ return kdb_read(buffer, bufsize);
+}
+
+/*
+ * kdb_input_flush
+ *
+ * Get rid of any buffered console input.
+ *
+ * Parameters:
+ * none
+ * Returns:
+ * nothing
+ * Locking:
+ * none
+ * Remarks:
+ * Call this function whenever you want to flush input. If there is any
+ * outstanding input, it ignores all characters until there has been no
+ * data for approximately 1ms.
+ */
+
+static void kdb_input_flush(void)
+{
+ get_char_func *f;
+ int res;
+ int flush_delay = 1;
+ while (flush_delay) {
+ flush_delay--;
+empty:
+ touch_nmi_watchdog();
+ for (f = &kdb_poll_funcs[0]; *f; ++f) {
+ res = (*f)();
+ if (res != -1) {
+ flush_delay = 1;
+ goto empty;
+ }
+ }
+ if (flush_delay)
+ mdelay(1);
+ }
+}
+
+/*
+ * kdb_printf
+ *
+ * Print a string to the output device(s).
+ *
+ * Parameters:
+ * printf-like format and optional args.
+ * Returns:
+ * 0
+ * Locking:
+ * None.
+ * Remarks:
+ * use 'kdbcons->write()' to avoid polluting 'log_buf' with
+ * kdb output.
+ *
+ * If the user is doing a cmd args | grep srch
+ * then kdb_grepping_flag is set.
+ * In that case we need to accumulate full lines (ending in \n) before
+ * searching for the pattern.
+ */
+
+static char kdb_buffer[256]; /* A bit too big to go on stack */
+static char *next_avail = kdb_buffer;
+static int size_avail;
+static int suspend_grep;
+
+/*
+ * search arg1 to see if it contains arg2
+ * (kdmain.c provides flags for ^pat and pat$)
+ *
+ * return 1 for found, 0 for not found
+ */
+static int kdb_search_string(char *searched, char *searchfor)
+{
+ char firstchar, *cp;
+ int len1, len2;
+
+ /* not counting the newline at the end of "searched" */
+ len1 = strlen(searched)-1;
+ len2 = strlen(searchfor);
+ if (len1 < len2)
+ return 0;
+ if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
+ return 0;
+ if (kdb_grep_leading) {
+ if (!strncmp(searched, searchfor, len2))
+ return 1;
+ } else if (kdb_grep_trailing) {
+ if (!strncmp(searched+len1-len2, searchfor, len2))
+ return 1;
+ } else {
+ firstchar = *searchfor;
+ cp = searched;
+ while ((cp = strchr(cp, firstchar))) {
+ if (!strncmp(cp, searchfor, len2))
+ return 1;
+ cp++;
+ }
+ }
+ return 0;
+}
+
+int vkdb_printf(const char *fmt, va_list ap)
+{
+ int diag;
+ int linecount;
+ int logging, saved_loglevel = 0;
+ int saved_trap_printk;
+ int got_printf_lock = 0;
+ int retlen = 0;
+ int fnd, len;
+ char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
+ char *moreprompt = "more> ";
+ struct console *c = console_drivers;
+ static DEFINE_SPINLOCK(kdb_printf_lock);
+ unsigned long uninitialized_var(flags);
+
+ preempt_disable();
+ saved_trap_printk = kdb_trap_printk;
+ kdb_trap_printk = 0;
+
+ /* Serialize kdb_printf if multiple cpus try to write at once.
+ * But if any cpu goes recursive in kdb, just print the output,
+ * even if it is interleaved with any other text.
+ */
+ if (!KDB_STATE(PRINTF_LOCK)) {
+ KDB_STATE_SET(PRINTF_LOCK);
+ spin_lock_irqsave(&kdb_printf_lock, flags);
+ got_printf_lock = 1;
+ atomic_inc(&kdb_event);
+ } else {
+ __acquire(kdb_printf_lock);
+ }
+
+ diag = kdbgetintenv("LINES", &linecount);
+ if (diag || linecount <= 1)
+ linecount = 24;
+
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (diag)
+ logging = 0;
+
+ if (!kdb_grepping_flag || suspend_grep) {
+ /* normally, every vsnprintf starts a new buffer */
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ }
+ vsnprintf(next_avail, size_avail, fmt, ap);
+
+ /*
+ * If kdb_parse() found that the command was cmd xxx | grep yyy
+ * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
+ *
+ * Accumulate the print data up to a newline before searching it.
+ * (vsnprintf does null-terminate the string that it generates)
+ */
+
+ /* skip the search if prints are temporarily unconditional */
+ if (!suspend_grep && kdb_grepping_flag) {
+ cp = strchr(kdb_buffer, '\n');
+ if (!cp) {
+ /*
+ * Special cases that don't end with newlines
+ * but should be written without one:
+ * The "[nn]kdb> " prompt should
+ * appear at the front of the buffer.
+ *
+ * The "[nn]more " prompt should also be
+ * (MOREPROMPT -> moreprompt)
+ * written * but we print that ourselves,
+ * we set the suspend_grep flag to make
+ * it unconditional.
+ *
+ */
+ if (next_avail == kdb_buffer) {
+ /*
+ * these should occur after a newline,
+ * so they will be at the front of the
+ * buffer
+ */
+ cp2 = kdb_buffer;
+ len = strlen(kdb_prompt_str);
+ if (!strncmp(cp2, kdb_prompt_str, len)) {
+ /*
+ * We're about to start a new
+ * command, so we can go back
+ * to normal mode.
+ */
+ kdb_grepping_flag = 0;
+ goto kdb_printit;
+ }
+ }
+ /* no newline; don't search/write the buffer
+ until one is there */
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+
+ /*
+ * The newline is present; print through it or discard
+ * it, depending on the results of the search.
+ */
+ cp++; /* to byte after the newline */
+ replaced_byte = *cp; /* remember what/where it was */
+ cphold = cp;
+ *cp = '\0'; /* end the string for our search */
+
+ /*
+ * We now have a newline at the end of the string
+ * Only continue with this output if it contains the
+ * search string.
+ */
+ fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
+ if (!fnd) {
+ /*
+ * At this point the complete line at the start
+ * of kdb_buffer can be discarded, as it does
+ * not contain what the user is looking for.
+ * Shift the buffer left.
+ */
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+ /*
+ * at this point the string is a full line and
+ * should be printed, up to the null.
+ */
+ }
+kdb_printit:
+
+ /*
+ * Write to all consoles.
+ */
+ retlen = strlen(kdb_buffer);
+ if (!dbg_kdb_mode && kgdb_connected) {
+ gdbstub_msg_write(kdb_buffer, retlen);
+ } else {
+ if (!dbg_io_ops->is_console) {
+ len = strlen(kdb_buffer);
+ cp = kdb_buffer;
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+ while (c) {
+ c->write(c, kdb_buffer, retlen);
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+ }
+ if (logging) {
+ saved_loglevel = console_loglevel;
+ console_loglevel = 0;
+ printk(KERN_INFO "%s", kdb_buffer);
+ }
+
+ if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
+ kdb_nextline++;
+
+ /* check for having reached the LINES number of printed lines */
+ if (kdb_nextline == linecount) {
+ char buf1[16] = "";
+#if defined(CONFIG_SMP)
+ char buf2[32];
+#endif
+
+ /* Watch out for recursion here. Any routine that calls
+ * kdb_printf will come back through here. And kdb_read
+ * uses kdb_printf to echo on serial consoles ...
+ */
+ kdb_nextline = 1; /* In case of recursion */
+
+ /*
+ * Pause until cr.
+ */
+ moreprompt = kdbgetenv("MOREPROMPT");
+ if (moreprompt == NULL)
+ moreprompt = "more> ";
+
+#if defined(CONFIG_SMP)
+ if (strchr(moreprompt, '%')) {
+ sprintf(buf2, moreprompt, get_cpu());
+ put_cpu();
+ moreprompt = buf2;
+ }
+#endif
+
+ kdb_input_flush();
+ c = console_drivers;
+
+ if (!dbg_io_ops->is_console) {
+ len = strlen(moreprompt);
+ cp = moreprompt;
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+ while (c) {
+ c->write(c, moreprompt, strlen(moreprompt));
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+
+ if (logging)
+ printk("%s", moreprompt);
+
+ kdb_read(buf1, 2); /* '2' indicates to return
+ * immediately after getting one key. */
+ kdb_nextline = 1; /* Really set output line 1 */
+
+ /* empty and reset the buffer: */
+ kdb_buffer[0] = '\0';
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
+ /* user hit q or Q */
+ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
+ KDB_STATE_CLEAR(PAGER);
+ /* end of command output; back to normal mode */
+ kdb_grepping_flag = 0;
+ kdb_printf("\n");
+ } else if (buf1[0] == ' ') {
+ kdb_printf("\n");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '\n') {
+ kdb_nextline = linecount - 1;
+ kdb_printf("\r");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] && buf1[0] != '\n') {
+ /* user hit something other than enter */
+ suspend_grep = 1; /* for this recursion */
+ kdb_printf("\nOnly 'q' or 'Q' are processed at more "
+ "prompt, input ignored\n");
+ } else if (kdb_grepping_flag) {
+ /* user hit enter */
+ suspend_grep = 1; /* for this recursion */
+ kdb_printf("\n");
+ }
+ kdb_input_flush();
+ }
+
+ /*
+ * For grep searches, shift the printed string left.
+ * replaced_byte contains the character that was overwritten with
+ * the terminating null, and cphold points to the null.
+ * Then adjust the notion of available space in the buffer.
+ */
+ if (kdb_grepping_flag && !suspend_grep) {
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ }
+
+kdb_print_out:
+ suspend_grep = 0; /* end of what may have been a recursive call */
+ if (logging)
+ console_loglevel = saved_loglevel;
+ if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
+ got_printf_lock = 0;
+ spin_unlock_irqrestore(&kdb_printf_lock, flags);
+ KDB_STATE_CLEAR(PRINTF_LOCK);
+ atomic_dec(&kdb_event);
+ } else {
+ __release(kdb_printf_lock);
+ }
+ kdb_trap_printk = saved_trap_printk;
+ preempt_enable();
+ return retlen;
+}
+
+int kdb_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int r;
+
+ va_start(ap, fmt);
+ r = vkdb_printf(fmt, ap);
+ va_end(ap);
+
+ return r;
+}
+
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000000..4bca634975c0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
+/*
+ * Kernel Debugger Architecture Dependent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kdb.h>
+#include <linux/keyboard.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/io.h>
+
+/* Keyboard Controller Registers on normal PCs. */
+
+#define KBD_STATUS_REG 0x64 /* Status register (R) */
+#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
+
+/* Status Register Bits */
+
+#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
+#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
+
+static int kbd_exists;
+
+/*
+ * Check if the keyboard controller has a keypress for us.
+ * Some parts (Enter Release, LED change) are still blocking polled here,
+ * but hopefully they are all short.
+ */
+int kdb_get_kbd_char(void)
+{
+ int scancode, scanstatus;
+ static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
+ static int shift_key; /* Shift next keypress */
+ static int ctrl_key;
+ u_short keychar;
+
+ if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
+ (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
+ kbd_exists = 0;
+ return -1;
+ }
+ kbd_exists = 1;
+
+ if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ return -1;
+
+ /*
+ * Fetch the scancode
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ /*
+ * Ignore mouse events.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ return -1;
+
+ /*
+ * Ignore release, trigger on make
+ * (except for shift keys, where we want to
+ * keep the shift state so long as the key is
+ * held down).
+ */
+
+ if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
+ /*
+ * Next key may use shift table
+ */
+ if ((scancode & 0x80) == 0)
+ shift_key = 1;
+ else
+ shift_key = 0;
+ return -1;
+ }
+
+ if ((scancode&0x7f) == 0x1d) {
+ /*
+ * Left ctrl key
+ */
+ if ((scancode & 0x80) == 0)
+ ctrl_key = 1;
+ else
+ ctrl_key = 0;
+ return -1;
+ }
+
+ if ((scancode & 0x80) != 0)
+ return -1;
+
+ scancode &= 0x7f;
+
+ /*
+ * Translate scancode
+ */
+
+ if (scancode == 0x3a) {
+ /*
+ * Toggle caps lock
+ */
+ shift_lock ^= 1;
+
+#ifdef KDB_BLINK_LED
+ kdb_toggleled(0x4);
+#endif
+ return -1;
+ }
+
+ if (scancode == 0x0e) {
+ /*
+ * Backspace
+ */
+ return 8;
+ }
+
+ /* Special Key */
+ switch (scancode) {
+ case 0xF: /* Tab */
+ return 9;
+ case 0x53: /* Del */
+ return 4;
+ case 0x47: /* Home */
+ return 1;
+ case 0x4F: /* End */
+ return 5;
+ case 0x4B: /* Left */
+ return 2;
+ case 0x48: /* Up */
+ return 16;
+ case 0x50: /* Down */
+ return 14;
+ case 0x4D: /* Right */
+ return 6;
+ }
+
+ if (scancode == 0xe0)
+ return -1;
+
+ /*
+ * For Japanese 86/106 keyboards
+ * See comment in drivers/char/pc_keyb.c.
+ * - Masahiro Adegawa
+ */
+ if (scancode == 0x73)
+ scancode = 0x59;
+ else if (scancode == 0x7d)
+ scancode = 0x7c;
+
+ if (!shift_lock && !shift_key && !ctrl_key) {
+ keychar = plain_map[scancode];
+ } else if ((shift_lock || shift_key) && key_maps[1]) {
+ keychar = key_maps[1][scancode];
+ } else if (ctrl_key && key_maps[4]) {
+ keychar = key_maps[4][scancode];
+ } else {
+ keychar = 0x0020;
+ kdb_printf("Unknown state/scancode (%d)\n", scancode);
+ }
+ keychar &= 0x0fff;
+ if (keychar == '\t')
+ keychar = ' ';
+ switch (KTYP(keychar)) {
+ case KT_LETTER:
+ case KT_LATIN:
+ if (isprint(keychar))
+ break; /* printable characters */
+ /* drop through */
+ case KT_SPEC:
+ if (keychar == K_ENTER)
+ break;
+ /* drop through */
+ default:
+ return -1; /* ignore unprintables */
+ }
+
+ if ((scancode & 0x7f) == 0x1c) {
+ /*
+ * enter key. All done. Absorb the release scancode.
+ */
+ while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ ;
+
+ /*
+ * Fetch the scancode
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ while (scanstatus & KBD_STAT_MOUSE_OBF) {
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+ }
+
+ if (scancode != 0x9c) {
+ /*
+ * Wasn't an enter-release, why not?
+ */
+ kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
+ scancode, scanstatus);
+ }
+
+ return 13;
+ }
+
+ return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000000..28b844118bbd
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2956 @@
+/*
+ * Kernel Debugger Architecture Independent Main Code
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
+ * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/smp.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/nmi.h>
+#include <linux/time.h>
+#include <linux/ptrace.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/kdebug.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+#define GREP_LEN 256
+char kdb_grep_string[GREP_LEN];
+int kdb_grepping_flag;
+EXPORT_SYMBOL(kdb_grepping_flag);
+int kdb_grep_leading;
+int kdb_grep_trailing;
+
+/*
+ * Kernel debugger state flags
+ */
+int kdb_flags;
+atomic_t kdb_event;
+
+/*
+ * kdb_lock protects updates to kdb_initial_cpu. Used to
+ * single thread processors through the kernel debugger.
+ */
+int kdb_initial_cpu = -1; /* cpu number that owns kdb */
+int kdb_nextline = 1;
+int kdb_state; /* General KDB state */
+
+struct task_struct *kdb_current_task;
+EXPORT_SYMBOL(kdb_current_task);
+struct pt_regs *kdb_current_regs;
+
+const char *kdb_diemsg;
+static int kdb_go_count;
+#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
+static unsigned int kdb_continue_catastrophic =
+ CONFIG_KDB_CONTINUE_CATASTROPHIC;
+#else
+static unsigned int kdb_continue_catastrophic;
+#endif
+
+/* kdb_commands describes the available commands. */
+static kdbtab_t *kdb_commands;
+#define KDB_BASE_CMD_MAX 50
+static int kdb_max_commands = KDB_BASE_CMD_MAX;
+static kdbtab_t kdb_base_commands[50];
+#define for_each_kdbcmd(cmd, num) \
+ for ((cmd) = kdb_base_commands, (num) = 0; \
+ num < kdb_max_commands; \
+ num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++)
+
+typedef struct _kdbmsg {
+ int km_diag; /* kdb diagnostic */
+ char *km_msg; /* Corresponding message text */
+} kdbmsg_t;
+
+#define KDBMSG(msgnum, text) \
+ { KDB_##msgnum, text }
+
+static kdbmsg_t kdbmsgs[] = {
+ KDBMSG(NOTFOUND, "Command Not Found"),
+ KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
+ KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
+ "8 is only allowed on 64 bit systems"),
+ KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
+ KDBMSG(NOTENV, "Cannot find environment variable"),
+ KDBMSG(NOENVVALUE, "Environment variable should have value"),
+ KDBMSG(NOTIMP, "Command not implemented"),
+ KDBMSG(ENVFULL, "Environment full"),
+ KDBMSG(ENVBUFFULL, "Environment buffer full"),
+ KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
+#ifdef CONFIG_CPU_XSCALE
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
+#else
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
+#endif
+ KDBMSG(DUPBPT, "Duplicate breakpoint address"),
+ KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
+ KDBMSG(BADMODE, "Invalid IDMODE"),
+ KDBMSG(BADINT, "Illegal numeric value"),
+ KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
+ KDBMSG(BADREG, "Invalid register name"),
+ KDBMSG(BADCPUNUM, "Invalid cpu number"),
+ KDBMSG(BADLENGTH, "Invalid length field"),
+ KDBMSG(NOBP, "No Breakpoint exists"),
+ KDBMSG(BADADDR, "Invalid address"),
+};
+#undef KDBMSG
+
+static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
+
+
+/*
+ * Initial environment. This is all kept static and local to
+ * this file. We don't want to rely on the memory allocation
+ * mechanisms in the kernel, so we use a very limited allocate-only
+ * heap for new and altered environment variables. The entire
+ * environment is limited to a fixed number of entries (add more
+ * to __env[] if required) and a fixed amount of heap (add more to
+ * KDB_ENVBUFSIZE if required).
+ */
+
+static char *__env[] = {
+#if defined(CONFIG_SMP)
+ "PROMPT=[%d]kdb> ",
+ "MOREPROMPT=[%d]more> ",
+#else
+ "PROMPT=kdb> ",
+ "MOREPROMPT=more> ",
+#endif
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ "BTARGS=9", /* 9 possible args in bt */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+};
+
+static const int __nenv = (sizeof(__env) / sizeof(char *));
+
+struct task_struct *kdb_curr_task(int cpu)
+{
+ struct task_struct *p = curr_task(cpu);
+#ifdef _TIF_MCA_INIT
+ if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
+ p = krp->p;
+#endif
+ return p;
+}
+
+/*
+ * kdbgetenv - This function will return the character string value of
+ * an environment variable.
+ * Parameters:
+ * match A character string representing an environment variable.
+ * Returns:
+ * NULL No environment variable matches 'match'
+ * char* Pointer to string value of environment variable.
+ */
+char *kdbgetenv(const char *match)
+{
+ char **ep = __env;
+ int matchlen = strlen(match);
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ char *e = *ep++;
+
+ if (!e)
+ continue;
+
+ if ((strncmp(match, e, matchlen) == 0)
+ && ((e[matchlen] == '\0')
+ || (e[matchlen] == '='))) {
+ char *cp = strchr(e, '=');
+ return cp ? ++cp : "";
+ }
+ }
+ return NULL;
+}
+
+/*
+ * kdballocenv - This function is used to allocate bytes for
+ * environment entries.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long representation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ * Remarks:
+ * We use a static environment buffer (envbuffer) to hold the values
+ * of dynamically generated environment variables (see kdb_set). Buffer
+ * space once allocated is never free'd, so over time, the amount of space
+ * (currently 512 bytes) will be exhausted if env variables are changed
+ * frequently.
+ */
+static char *kdballocenv(size_t bytes)
+{
+#define KDB_ENVBUFSIZE 512
+ static char envbuffer[KDB_ENVBUFSIZE];
+ static int envbufsize;
+ char *ep = NULL;
+
+ if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
+ ep = &envbuffer[envbufsize];
+ envbufsize += bytes;
+ }
+ return ep;
+}
+
+/*
+ * kdbgetulenv - This function will return the value of an unsigned
+ * long-valued environment variable.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+static int kdbgetulenv(const char *match, unsigned long *value)
+{
+ char *ep;
+
+ ep = kdbgetenv(match);
+ if (!ep)
+ return KDB_NOTENV;
+ if (strlen(ep) == 0)
+ return KDB_NOENVVALUE;
+
+ *value = simple_strtoul(ep, NULL, 0);
+
+ return 0;
+}
+
+/*
+ * kdbgetintenv - This function will return the value of an
+ * integer-valued environment variable.
+ * Parameters:
+ * match A character string representing an integer-valued env variable
+ * Outputs:
+ * *value the integer representation of the environment variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetintenv(const char *match, int *value)
+{
+ unsigned long val;
+ int diag;
+
+ diag = kdbgetulenv(match, &val);
+ if (!diag)
+ *value = (int) val;
+ return diag;
+}
+
+/*
+ * kdbgetularg - This function will convert a numeric string into an
+ * unsigned long value.
+ * Parameters:
+ * arg A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of arg.
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetularg(const char *arg, unsigned long *value)
+{
+ char *endp;
+ unsigned long val;
+
+ val = simple_strtoul(arg, &endp, 0);
+
+ if (endp == arg) {
+ /*
+ * Also try base 16, for us folks too lazy to type the
+ * leading 0x...
+ */
+ val = simple_strtoul(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+ char *endp;
+ u64 val;
+
+ val = simple_strtoull(arg, &endp, 0);
+
+ if (endp == arg) {
+
+ val = simple_strtoull(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+/*
+ * kdb_set - This function implements the 'set' command. Alter an
+ * existing environment variable or create a new one.
+ */
+int kdb_set(int argc, const char **argv)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ /*
+ * we can be invoked two ways:
+ * set var=value argv[1]="var", argv[2]="value"
+ * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
+ * - if the latter, shift 'em down.
+ */
+ if (argc == 3) {
+ argv[2] = argv[3];
+ argc--;
+ }
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ /*
+ * Check for internal variables
+ */
+ if (strcmp(argv[1], "KDBDEBUG") == 0) {
+ unsigned int debugflags;
+ char *cp;
+
+ debugflags = simple_strtoul(argv[2], &cp, 0);
+ if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+ kdb_printf("kdb: illegal debug flags '%s'\n",
+ argv[2]);
+ return 0;
+ }
+ kdb_flags = (kdb_flags &
+ ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+ | (debugflags << KDB_DEBUG_FLAG_SHIFT);
+
+ return 0;
+ }
+
+ /*
+ * Tokenizer squashed the '=' sign. argv[1] is variable
+ * name, argv[2] = value.
+ */
+ varlen = strlen(argv[1]);
+ vallen = strlen(argv[2]);
+ ep = kdballocenv(varlen + vallen + 2);
+ if (ep == (char *)0)
+ return KDB_ENVBUFFULL;
+
+ sprintf(ep, "%s=%s", argv[1], argv[2]);
+
+ ep[varlen+vallen+1] = '\0';
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], argv[1], varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't existing variable. Fit into slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+static int kdb_check_regs(void)
+{
+ if (!kdb_current_regs) {
+ kdb_printf("No current kdb registers."
+ " You may need to select another task\n");
+ return KDB_BADREG;
+ }
+ return 0;
+}
+
+/*
+ * kdbgetaddrarg - This function is responsible for parsing an
+ * address-expression and returning the value of the expression,
+ * symbol name, and offset to the caller.
+ *
+ * The argument may consist of a numeric value (decimal or
+ * hexidecimal), a symbol name, a register name (preceeded by the
+ * percent sign), an environment variable with a numeric value
+ * (preceeded by a dollar sign) or a simple arithmetic expression
+ * consisting of a symbol name, +/-, and a numeric constant value
+ * (offset).
+ * Parameters:
+ * argc - count of arguments in argv
+ * argv - argument vector
+ * *nextarg - index to next unparsed argument in argv[]
+ * regs - Register state at time of KDB entry
+ * Outputs:
+ * *value - receives the value of the address-expression
+ * *offset - receives the offset specified, if any
+ * *name - receives the symbol name, if any
+ * *nextarg - index to next unparsed argument in argv[]
+ * Returns:
+ * zero is returned on success, a kdb diagnostic code is
+ * returned on error.
+ */
+int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
+ unsigned long *value, long *offset,
+ char **name)
+{
+ unsigned long addr;
+ unsigned long off = 0;
+ int positive;
+ int diag;
+ int found = 0;
+ char *symname;
+ char symbol = '\0';
+ char *cp;
+ kdb_symtab_t symtab;
+
+ /*
+ * Process arguments which follow the following syntax:
+ *
+ * symbol | numeric-address [+/- numeric-offset]
+ * %register
+ * $environment-variable
+ */
+
+ if (*nextarg > argc)
+ return KDB_ARGCOUNT;
+
+ symname = (char *)argv[*nextarg];
+
+ /*
+ * If there is no whitespace between the symbol
+ * or address and the '+' or '-' symbols, we
+ * remember the character and replace it with a
+ * null so the symbol/value can be properly parsed
+ */
+ cp = strpbrk(symname, "+-");
+ if (cp != NULL) {
+ symbol = *cp;
+ *cp++ = '\0';
+ }
+
+ if (symname[0] == '$') {
+ diag = kdbgetulenv(&symname[1], &addr);
+ if (diag)
+ return diag;
+ } else if (symname[0] == '%') {
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+ /* Implement register values with % at a later time as it is
+ * arch optional.
+ */
+ return KDB_NOTIMP;
+ } else {
+ found = kdbgetsymval(symname, &symtab);
+ if (found) {
+ addr = symtab.sym_start;
+ } else {
+ diag = kdbgetularg(argv[*nextarg], &addr);
+ if (diag)
+ return diag;
+ }
+ }
+
+ if (!found)
+ found = kdbnearsym(addr, &symtab);
+
+ (*nextarg)++;
+
+ if (name)
+ *name = symname;
+ if (value)
+ *value = addr;
+ if (offset && name && *name)
+ *offset = addr - symtab.sym_start;
+
+ if ((*nextarg > argc)
+ && (symbol == '\0'))
+ return 0;
+
+ /*
+ * check for +/- and offset
+ */
+
+ if (symbol == '\0') {
+ if ((argv[*nextarg][0] != '+')
+ && (argv[*nextarg][0] != '-')) {
+ /*
+ * Not our argument. Return.
+ */
+ return 0;
+ } else {
+ positive = (argv[*nextarg][0] == '+');
+ (*nextarg)++;
+ }
+ } else
+ positive = (symbol == '+');
+
+ /*
+ * Now there must be an offset!
+ */
+ if ((*nextarg > argc)
+ && (symbol == '\0')) {
+ return KDB_INVADDRFMT;
+ }
+
+ if (!symbol) {
+ cp = (char *)argv[*nextarg];
+ (*nextarg)++;
+ }
+
+ diag = kdbgetularg(cp, &off);
+ if (diag)
+ return diag;
+
+ if (!positive)
+ off = -off;
+
+ if (offset)
+ *offset += off;
+
+ if (value)
+ *value += off;
+
+ return 0;
+}
+
+static void kdb_cmderror(int diag)
+{
+ int i;
+
+ if (diag >= 0) {
+ kdb_printf("no error detected (diagnostic is %d)\n", diag);
+ return;
+ }
+
+ for (i = 0; i < __nkdb_err; i++) {
+ if (kdbmsgs[i].km_diag == diag) {
+ kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
+ return;
+ }
+ }
+
+ kdb_printf("Unknown diag %d\n", -diag);
+}
+
+/*
+ * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
+ * command which defines one command as a set of other commands,
+ * terminated by endefcmd. kdb_defcmd processes the initial
+ * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
+ * the following commands until 'endefcmd'.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+struct defcmd_set {
+ int count;
+ int usable;
+ char *name;
+ char *usage;
+ char *help;
+ char **command;
+};
+static struct defcmd_set *defcmd_set;
+static int defcmd_set_count;
+static int defcmd_in_progress;
+
+/* Forward references */
+static int kdb_exec_defcmd(int argc, const char **argv);
+
+static int kdb_defcmd2(const char *cmdstr, const char *argv0)
+{
+ struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
+ char **save_command = s->command;
+ if (strcmp(argv0, "endefcmd") == 0) {
+ defcmd_in_progress = 0;
+ if (!s->count)
+ s->usable = 0;
+ if (s->usable)
+ kdb_register(s->name, kdb_exec_defcmd,
+ s->usage, s->help, 0);
+ return 0;
+ }
+ if (!s->usable)
+ return KDB_NOTIMP;
+ s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
+ if (!s->command) {
+ kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+ cmdstr);
+ s->usable = 0;
+ return KDB_NOTIMP;
+ }
+ memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
+ s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
+ kfree(save_command);
+ return 0;
+}
+
+static int kdb_defcmd(int argc, const char **argv)
+{
+ struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ if (defcmd_in_progress) {
+ kdb_printf("kdb: nested defcmd detected, assuming missing "
+ "endefcmd\n");
+ kdb_defcmd2("endefcmd", "endefcmd");
+ }
+ if (argc == 0) {
+ int i;
+ for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
+ s->usage, s->help);
+ for (i = 0; i < s->count; ++i)
+ kdb_printf("%s", s->command[i]);
+ kdb_printf("endefcmd\n");
+ }
+ return 0;
+ }
+ if (argc != 3)
+ return KDB_ARGCOUNT;
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set) {
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n",
+ argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
+ }
+ memcpy(defcmd_set, save_defcmd_set,
+ defcmd_set_count * sizeof(*defcmd_set));
+ kfree(save_defcmd_set);
+ s = defcmd_set + defcmd_set_count;
+ memset(s, 0, sizeof(*s));
+ s->usable = 1;
+ s->name = kdb_strdup(argv[1], GFP_KDB);
+ s->usage = kdb_strdup(argv[2], GFP_KDB);
+ s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (s->usage[0] == '"') {
+ strcpy(s->usage, s->usage+1);
+ s->usage[strlen(s->usage)-1] = '\0';
+ }
+ if (s->help[0] == '"') {
+ strcpy(s->help, s->help+1);
+ s->help[strlen(s->help)-1] = '\0';
+ }
+ ++defcmd_set_count;
+ defcmd_in_progress = 1;
+ return 0;
+}
+
+/*
+ * kdb_exec_defcmd - Execute the set of commands associated with this
+ * defcmd name.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+static int kdb_exec_defcmd(int argc, const char **argv)
+{
+ int i, ret;
+ struct defcmd_set *s;
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
+ if (strcmp(s->name, argv[0]) == 0)
+ break;
+ }
+ if (i == defcmd_set_count) {
+ kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
+ argv[0]);
+ return KDB_NOTIMP;
+ }
+ for (i = 0; i < s->count; ++i) {
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
+ ret = kdb_parse(s->command[i]);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Command history */
+#define KDB_CMD_HISTORY_COUNT 32
+#define CMD_BUFLEN 200 /* kdb_printf: max printline
+ * size == 256 */
+static unsigned int cmd_head, cmd_tail;
+static unsigned int cmdptr;
+static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
+static char cmd_cur[CMD_BUFLEN];
+
+/*
+ * The "str" argument may point to something like | grep xyz
+ */
+static void parse_grep(const char *str)
+{
+ int len;
+ char *cp = (char *)str, *cp2;
+
+ /* sanity check: we should have been called with the \ first */
+ if (*cp != '|')
+ return;
+ cp++;
+ while (isspace(*cp))
+ cp++;
+ if (strncmp(cp, "grep ", 5)) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ cp += 5;
+ while (isspace(*cp))
+ cp++;
+ cp2 = strchr(cp, '\n');
+ if (cp2)
+ *cp2 = '\0'; /* remove the trailing newline */
+ len = strlen(cp);
+ if (len == 0) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ /* now cp points to a nonzero length search string */
+ if (*cp == '"') {
+ /* allow it be "x y z" by removing the "'s - there must
+ be two of them */
+ cp++;
+ cp2 = strchr(cp, '"');
+ if (!cp2) {
+ kdb_printf("invalid quoted string, see grephelp\n");
+ return;
+ }
+ *cp2 = '\0'; /* end the string where the 2nd " was */
+ }
+ kdb_grep_leading = 0;
+ if (*cp == '^') {
+ kdb_grep_leading = 1;
+ cp++;
+ }
+ len = strlen(cp);
+ kdb_grep_trailing = 0;
+ if (*(cp+len-1) == '$') {
+ kdb_grep_trailing = 1;
+ *(cp+len-1) = '\0';
+ }
+ len = strlen(cp);
+ if (!len)
+ return;
+ if (len >= GREP_LEN) {
+ kdb_printf("search string too long\n");
+ return;
+ }
+ strcpy(kdb_grep_string, cp);
+ kdb_grepping_flag++;
+ return;
+}
+
+/*
+ * kdb_parse - Parse the command line, search the command table for a
+ * matching command and invoke the command function. This
+ * function may be called recursively, if it is, the second call
+ * will overwrite argv and cbuf. It is the caller's
+ * responsibility to save their argv if they recursively call
+ * kdb_parse().
+ * Parameters:
+ * cmdstr The input command line to be parsed.
+ * regs The registers at the time kdb was entered.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Remarks:
+ * Limited to 20 tokens.
+ *
+ * Real rudimentary tokenization. Basically only whitespace
+ * is considered a token delimeter (but special consideration
+ * is taken of the '=' sign as used by the 'set' command).
+ *
+ * The algorithm used to tokenize the input string relies on
+ * there being at least one whitespace (or otherwise useless)
+ * character between tokens as the character immediately following
+ * the token is altered in-place to a null-byte to terminate the
+ * token string.
+ */
+
+#define MAXARGC 20
+
+int kdb_parse(const char *cmdstr)
+{
+ static char *argv[MAXARGC];
+ static int argc;
+ static char cbuf[CMD_BUFLEN+2];
+ char *cp;
+ char *cpp, quoted;
+ kdbtab_t *tp;
+ int i, escaped, ignore_errors = 0, check_grep;
+
+ /*
+ * First tokenize the command string.
+ */
+ cp = (char *)cmdstr;
+ kdb_grepping_flag = check_grep = 0;
+
+ if (KDB_FLAG(CMD_INTERRUPT)) {
+ /* Previous command was interrupted, newline must not
+ * repeat the command */
+ KDB_FLAG_CLEAR(CMD_INTERRUPT);
+ KDB_STATE_SET(PAGER);
+ argc = 0; /* no repeat */
+ }
+
+ if (*cp != '\n' && *cp != '\0') {
+ argc = 0;
+ cpp = cbuf;
+ while (*cp) {
+ /* skip whitespace */
+ while (isspace(*cp))
+ cp++;
+ if ((*cp == '\0') || (*cp == '\n') ||
+ (*cp == '#' && !defcmd_in_progress))
+ break;
+ /* special case: check for | grep pattern */
+ if (*cp == '|') {
+ check_grep++;
+ break;
+ }
+ if (cpp >= cbuf + CMD_BUFLEN) {
+ kdb_printf("kdb_parse: command buffer "
+ "overflow, command ignored\n%s\n",
+ cmdstr);
+ return KDB_NOTFOUND;
+ }
+ if (argc >= MAXARGC - 1) {
+ kdb_printf("kdb_parse: too many arguments, "
+ "command ignored\n%s\n", cmdstr);
+ return KDB_NOTFOUND;
+ }
+ argv[argc++] = cpp;
+ escaped = 0;
+ quoted = '\0';
+ /* Copy to next unquoted and unescaped
+ * whitespace or '=' */
+ while (*cp && *cp != '\n' &&
+ (escaped || quoted || !isspace(*cp))) {
+ if (cpp >= cbuf + CMD_BUFLEN)
+ break;
+ if (escaped) {
+ escaped = 0;
+ *cpp++ = *cp++;
+ continue;
+ }
+ if (*cp == '\\') {
+ escaped = 1;
+ ++cp;
+ continue;
+ }
+ if (*cp == quoted)
+ quoted = '\0';
+ else if (*cp == '\'' || *cp == '"')
+ quoted = *cp;
+ *cpp = *cp++;
+ if (*cpp == '=' && !quoted)
+ break;
+ ++cpp;
+ }
+ *cpp++ = '\0'; /* Squash a ws or '=' character */
+ }
+ }
+ if (!argc)
+ return 0;
+ if (check_grep)
+ parse_grep(cp);
+ if (defcmd_in_progress) {
+ int result = kdb_defcmd2(cmdstr, argv[0]);
+ if (!defcmd_in_progress) {
+ argc = 0; /* avoid repeat on endefcmd */
+ *(argv[0]) = '\0';
+ }
+ return result;
+ }
+ if (argv[0][0] == '-' && argv[0][1] &&
+ (argv[0][1] < '0' || argv[0][1] > '9')) {
+ ignore_errors = 1;
+ ++argv[0];
+ }
+
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+
+ if (tp->cmd_minlen
+ && (strlen(argv[0]) <= tp->cmd_minlen)) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ tp->cmd_minlen) == 0) {
+ break;
+ }
+ }
+
+ if (strcmp(argv[0], tp->cmd_name) == 0)
+ break;
+ }
+ }
+
+ /*
+ * If we don't find a command by this name, see if the first
+ * few characters of this match any of the known commands.
+ * e.g., md1c20 should match md.
+ */
+ if (i == kdb_max_commands) {
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ strlen(tp->cmd_name)) == 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ if (i < kdb_max_commands) {
+ int result;
+ KDB_STATE_SET(CMD);
+ result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ if (result && ignore_errors && result > KDB_CMD_GO)
+ result = 0;
+ KDB_STATE_CLEAR(CMD);
+ switch (tp->cmd_repeat) {
+ case KDB_REPEAT_NONE:
+ argc = 0;
+ if (argv[0])
+ *(argv[0]) = '\0';
+ break;
+ case KDB_REPEAT_NO_ARGS:
+ argc = 1;
+ if (argv[1])
+ *(argv[1]) = '\0';
+ break;
+ case KDB_REPEAT_WITH_ARGS:
+ break;
+ }
+ return result;
+ }
+
+ /*
+ * If the input with which we were presented does not
+ * map to an existing command, attempt to parse it as an
+ * address argument and display the result. Useful for
+ * obtaining the address of a variable, or the nearest symbol
+ * to an address contained in a register.
+ */
+ {
+ unsigned long value;
+ char *name = NULL;
+ long offset;
+ int nextarg = 0;
+
+ if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
+ &value, &offset, &name)) {
+ return KDB_NOTFOUND;
+ }
+
+ kdb_printf("%s = ", argv[0]);
+ kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
+ kdb_printf("\n");
+ return 0;
+ }
+}
+
+
+static int handle_ctrl_cmd(char *cmd)
+{
+#define CTRL_P 16
+#define CTRL_N 14
+
+ /* initial situation */
+ if (cmd_head == cmd_tail)
+ return 0;
+ switch (*cmd) {
+ case CTRL_P:
+ if (cmdptr != cmd_tail)
+ cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ case CTRL_N:
+ if (cmdptr != cmd_head)
+ cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_reboot - This function implements the 'reboot' command. Reboot
+ * the system immediately, or loop for ever on failure.
+ */
+static int kdb_reboot(int argc, const char **argv)
+{
+ emergency_restart();
+ kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
+ while (1)
+ cpu_relax();
+ /* NOTREACHED */
+ return 0;
+}
+
+static void kdb_dumpregs(struct pt_regs *regs)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = 15;
+ kdb_trap_printk++;
+ show_regs(regs);
+ kdb_trap_printk--;
+ kdb_printf("\n");
+ console_loglevel = old_lvl;
+}
+
+void kdb_set_current_task(struct task_struct *p)
+{
+ kdb_current_task = p;
+
+ if (kdb_task_has_cpu(p)) {
+ kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
+ return;
+ }
+ kdb_current_regs = NULL;
+}
+
+/*
+ * kdb_local - The main code for kdb. This routine is invoked on a
+ * specific processor, it is not global. The main kdb() routine
+ * ensures that only one processor at a time is in this routine.
+ * This code is called with the real reason code on the first
+ * entry to a kdb session, thereafter it is called with reason
+ * SWITCH, even if the user goes back to the original cpu.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * regs The exception frame at time of fault/breakpoint.
+ * db_result Result code from the break or debug point.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ * KDB_CMD_GO User typed 'go'.
+ * KDB_CMD_CPU User switched to another cpu.
+ * KDB_CMD_SS Single step.
+ * KDB_CMD_SSB Single step until branch.
+ */
+static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
+ kdb_dbtrap_t db_result)
+{
+ char *cmdbuf;
+ int diag;
+ struct task_struct *kdb_current =
+ kdb_curr_task(raw_smp_processor_id());
+
+ KDB_DEBUG_STATE("kdb_local 1", reason);
+ kdb_go_count = 0;
+ if (reason == KDB_REASON_DEBUG) {
+ /* special case below */
+ } else {
+ kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+ kdb_current, kdb_current->pid);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ }
+
+ switch (reason) {
+ case KDB_REASON_DEBUG:
+ {
+ /*
+ * If re-entering kdb after a single step
+ * command, don't print the message.
+ */
+ switch (db_result) {
+ case KDB_DB_BPT:
+ kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+ kdb_current, kdb_current->pid);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ case KDB_DB_SSB:
+ /*
+ * In the midst of ssb command. Just return.
+ */
+ KDB_DEBUG_STATE("kdb_local 3", reason);
+ return KDB_CMD_SSB; /* Continue with SSB command */
+
+ break;
+ case KDB_DB_SS:
+ break;
+ case KDB_DB_SSBPT:
+ KDB_DEBUG_STATE("kdb_local 4", reason);
+ return 1; /* kdba_db_trap did the work */
+ default:
+ kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
+ db_result);
+ break;
+ }
+
+ }
+ break;
+ case KDB_REASON_ENTER:
+ if (KDB_STATE(KEYBOARD))
+ kdb_printf("due to Keyboard Entry\n");
+ else
+ kdb_printf("due to KDB_ENTER()\n");
+ break;
+ case KDB_REASON_KEYBOARD:
+ KDB_STATE_SET(KEYBOARD);
+ kdb_printf("due to Keyboard Entry\n");
+ break;
+ case KDB_REASON_ENTER_SLAVE:
+ /* drop through, slaves only get released via cpu switch */
+ case KDB_REASON_SWITCH:
+ kdb_printf("due to cpu switch\n");
+ break;
+ case KDB_REASON_OOPS:
+ kdb_printf("Oops: %s\n", kdb_diemsg);
+ kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ kdb_dumpregs(regs);
+ break;
+ case KDB_REASON_NMI:
+ kdb_printf("due to NonMaskable Interrupt @ "
+ kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ kdb_dumpregs(regs);
+ break;
+ case KDB_REASON_SSTEP:
+ case KDB_REASON_BREAK:
+ kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
+ reason == KDB_REASON_BREAK ?
+ "Breakpoint" : "SS trap", instruction_pointer(regs));
+ /*
+ * Determine if this breakpoint is one that we
+ * are interested in.
+ */
+ if (db_result != KDB_DB_BPT) {
+ kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
+ db_result);
+ KDB_DEBUG_STATE("kdb_local 6", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+ break;
+ case KDB_REASON_RECURSE:
+ kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ default:
+ kdb_printf("kdb: unexpected reason code: %d\n", reason);
+ KDB_DEBUG_STATE("kdb_local 8", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+
+ while (1) {
+ /*
+ * Initialize pager context.
+ */
+ kdb_nextline = 1;
+ KDB_STATE_CLEAR(SUPPRESS);
+
+ cmdbuf = cmd_cur;
+ *cmdbuf = '\0';
+ *(cmd_hist[cmd_head]) = '\0';
+
+ if (KDB_FLAG(ONLY_DO_DUMP)) {
+ /* kdb is off but a catastrophic error requires a dump.
+ * Take the dump and reboot.
+ * Turn on logging so the kdb output appears in the log
+ * buffer in the dump.
+ */
+ const char *setargs[] = { "set", "LOGGING", "1" };
+ kdb_set(2, setargs);
+ kdb_reboot(0, NULL);
+ /*NOTREACHED*/
+ }
+
+do_full_getstr:
+#if defined(CONFIG_SMP)
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
+ raw_smp_processor_id());
+#else
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
+#endif
+ if (defcmd_in_progress)
+ strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
+
+ /*
+ * Fetch command from keyboard
+ */
+ cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
+ if (*cmdbuf != '\n') {
+ if (*cmdbuf < 32) {
+ if (cmdptr == cmd_head) {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ *(cmd_hist[cmd_head] +
+ strlen(cmd_hist[cmd_head])-1) = '\0';
+ }
+ if (!handle_ctrl_cmd(cmdbuf))
+ *(cmd_cur+strlen(cmd_cur)-1) = '\0';
+ cmdbuf = cmd_cur;
+ goto do_full_getstr;
+ } else {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ }
+
+ cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
+ if (cmd_head == cmd_tail)
+ cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
+ }
+
+ cmdptr = cmd_head;
+ diag = kdb_parse(cmdbuf);
+ if (diag == KDB_NOTFOUND) {
+ kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
+ diag = 0;
+ }
+ if (diag == KDB_CMD_GO
+ || diag == KDB_CMD_CPU
+ || diag == KDB_CMD_SS
+ || diag == KDB_CMD_SSB
+ || diag == KDB_CMD_KGDB)
+ break;
+
+ if (diag)
+ kdb_cmderror(diag);
+ }
+ KDB_DEBUG_STATE("kdb_local 9", diag);
+ return diag;
+}
+
+
+/*
+ * kdb_print_state - Print the state data for the current processor
+ * for debugging.
+ * Inputs:
+ * text Identifies the debug point
+ * value Any integer value to be printed, e.g. reason code.
+ */
+void kdb_print_state(const char *text, int value)
+{
+ kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
+ text, raw_smp_processor_id(), value, kdb_initial_cpu,
+ kdb_state);
+}
+
+/*
+ * kdb_main_loop - After initial setup and assignment of the
+ * controlling cpu, all cpus are in this loop. One cpu is in
+ * control and will issue the kdb prompt, the others will spin
+ * until 'go' or cpu switch.
+ *
+ * To get a consistent view of the kernel stacks for all
+ * processes, this routine is invoked from the main kdb code via
+ * an architecture specific routine. kdba_main_loop is
+ * responsible for making the kernel stacks consistent for all
+ * processes, there should be no difference between a blocked
+ * process and a running process as far as kdb is concerned.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * reason2 kdb's current reason code.
+ * Initially error but can change
+ * acording to kdb state.
+ * db_result Result code from break or debug point.
+ * regs The exception frame at time of fault/breakpoint.
+ * should always be valid.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ */
+int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
+ kdb_dbtrap_t db_result, struct pt_regs *regs)
+{
+ int result = 1;
+ /* Stay in kdb() until 'go', 'ss[b]' or an error */
+ while (1) {
+ /*
+ * All processors except the one that is in control
+ * will spin here.
+ */
+ KDB_DEBUG_STATE("kdb_main_loop 1", reason);
+ while (KDB_STATE(HOLD_CPU)) {
+ /* state KDB is turned off by kdb_cpu to see if the
+ * other cpus are still live, each cpu in this loop
+ * turns it back on.
+ */
+ if (!KDB_STATE(KDB))
+ KDB_STATE_SET(KDB);
+ }
+
+ KDB_STATE_CLEAR(SUPPRESS);
+ KDB_DEBUG_STATE("kdb_main_loop 2", reason);
+ if (KDB_STATE(LEAVING))
+ break; /* Another cpu said 'go' */
+ /* Still using kdb, this processor is in control */
+ result = kdb_local(reason2, error, regs, db_result);
+ KDB_DEBUG_STATE("kdb_main_loop 3", result);
+
+ if (result == KDB_CMD_CPU)
+ break;
+
+ if (result == KDB_CMD_SS) {
+ KDB_STATE_SET(DOING_SS);
+ break;
+ }
+
+ if (result == KDB_CMD_SSB) {
+ KDB_STATE_SET(DOING_SS);
+ KDB_STATE_SET(DOING_SSB);
+ break;
+ }
+
+ if (result == KDB_CMD_KGDB) {
+ if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
+ kdb_printf("Entering please attach debugger "
+ "or use $D#44+ or $3#33\n");
+ break;
+ }
+ if (result && result != 1 && result != KDB_CMD_GO)
+ kdb_printf("\nUnexpected kdb_local return code %d\n",
+ result);
+ KDB_DEBUG_STATE("kdb_main_loop 4", reason);
+ break;
+ }
+ if (KDB_STATE(DOING_SS))
+ KDB_STATE_CLEAR(SSBPT);
+
+ return result;
+}
+
+/*
+ * kdb_mdr - This function implements the guts of the 'mdr', memory
+ * read command.
+ * mdr <addr arg>,<byte count>
+ * Inputs:
+ * addr Start address
+ * count Number of bytes
+ * Returns:
+ * Always 0. Any errors are detected and printed by kdb_getarea.
+ */
+static int kdb_mdr(unsigned long addr, unsigned int count)
+{
+ unsigned char c;
+ while (count--) {
+ if (kdb_getarea(c, addr))
+ return 0;
+ kdb_printf("%02x", c);
+ addr++;
+ }
+ kdb_printf("\n");
+ return 0;
+}
+
+/*
+ * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
+ * 'md8' 'mdr' and 'mds' commands.
+ *
+ * md|mds [<addr arg> [<line count> [<radix>]]]
+ * mdWcN [<addr arg> [<line count> [<radix>]]]
+ * where W = is the width (1, 2, 4 or 8) and N is the count.
+ * for eg., md1c20 reads 20 bytes, 1 at a time.
+ * mdr <addr arg>,<byte count>
+ */
+static void kdb_md_line(const char *fmtstr, unsigned long addr,
+ int symbolic, int nosect, int bytesperword,
+ int num, int repeat, int phys)
+{
+ /* print just one line of data */
+ kdb_symtab_t symtab;
+ char cbuf[32];
+ char *c = cbuf;
+ int i;
+ unsigned long word;
+
+ memset(cbuf, '\0', sizeof(cbuf));
+ if (phys)
+ kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
+ else
+ kdb_printf(kdb_machreg_fmt0 " ", addr);
+
+ for (i = 0; i < num && repeat--; i++) {
+ if (phys) {
+ if (kdb_getphysword(&word, addr, bytesperword))
+ break;
+ } else if (kdb_getword(&word, addr, bytesperword))
+ break;
+ kdb_printf(fmtstr, word);
+ if (symbolic)
+ kdbnearsym(word, &symtab);
+ else
+ memset(&symtab, 0, sizeof(symtab));
+ if (symtab.sym_name) {
+ kdb_symbol_print(word, &symtab, 0);
+ if (!nosect) {
+ kdb_printf("\n");
+ kdb_printf(" %s %s "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt, symtab.mod_name,
+ symtab.sec_name, symtab.sec_start,
+ symtab.sym_start, symtab.sym_end);
+ }
+ addr += bytesperword;
+ } else {
+ union {
+ u64 word;
+ unsigned char c[8];
+ } wc;
+ unsigned char *cp;
+#ifdef __BIG_ENDIAN
+ cp = wc.c + 8 - bytesperword;
+#else
+ cp = wc.c;
+#endif
+ wc.word = word;
+#define printable_char(c) \
+ ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
+ switch (bytesperword) {
+ case 8:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 4;
+ case 4:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 2;
+ case 2:
+ *c++ = printable_char(*cp++);
+ addr++;
+ case 1:
+ *c++ = printable_char(*cp++);
+ addr++;
+ break;
+ }
+#undef printable_char
+ }
+ }
+ kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
+ " ", cbuf);
+}
+
+static int kdb_md(int argc, const char **argv)
+{
+ static unsigned long last_addr;
+ static int last_radix, last_bytesperword, last_repeat;
+ int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
+ int nosect = 0;
+ char fmtchar, fmtstr[64];
+ unsigned long addr;
+ unsigned long word;
+ long offset = 0;
+ int symbolic = 0;
+ int valid = 0;
+ int phys = 0;
+
+ kdbgetintenv("MDCOUNT", &mdcount);
+ kdbgetintenv("RADIX", &radix);
+ kdbgetintenv("BYTESPERWORD", &bytesperword);
+
+ /* Assume 'md <addr>' and start with environment values */
+ repeat = mdcount * 16 / bytesperword;
+
+ if (strcmp(argv[0], "mdr") == 0) {
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ valid = 1;
+ } else if (isdigit(argv[0][2])) {
+ bytesperword = (int)(argv[0][2] - '0');
+ if (bytesperword == 0) {
+ bytesperword = last_bytesperword;
+ if (bytesperword == 0)
+ bytesperword = 4;
+ }
+ last_bytesperword = bytesperword;
+ repeat = mdcount * 16 / bytesperword;
+ if (!argv[0][3])
+ valid = 1;
+ else if (argv[0][3] == 'c' && argv[0][4]) {
+ char *p;
+ repeat = simple_strtoul(argv[0] + 4, &p, 10);
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ valid = !*p;
+ }
+ last_repeat = repeat;
+ } else if (strcmp(argv[0], "md") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mds") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mdp") == 0) {
+ phys = valid = 1;
+ }
+ if (!valid)
+ return KDB_NOTFOUND;
+
+ if (argc == 0) {
+ if (last_addr == 0)
+ return KDB_ARGCOUNT;
+ addr = last_addr;
+ radix = last_radix;
+ bytesperword = last_bytesperword;
+ repeat = last_repeat;
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ }
+
+ if (argc) {
+ unsigned long val;
+ int diag, nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ if (argc > nextarg+2)
+ return KDB_ARGCOUNT;
+
+ if (argc >= nextarg) {
+ diag = kdbgetularg(argv[nextarg], &val);
+ if (!diag) {
+ mdcount = (int) val;
+ repeat = mdcount * 16 / bytesperword;
+ }
+ }
+ if (argc >= nextarg+1) {
+ diag = kdbgetularg(argv[nextarg+1], &val);
+ if (!diag)
+ radix = (int) val;
+ }
+ }
+
+ if (strcmp(argv[0], "mdr") == 0)
+ return kdb_mdr(addr, mdcount);
+
+ switch (radix) {
+ case 10:
+ fmtchar = 'd';
+ break;
+ case 16:
+ fmtchar = 'x';
+ break;
+ case 8:
+ fmtchar = 'o';
+ break;
+ default:
+ return KDB_BADRADIX;
+ }
+
+ last_radix = radix;
+
+ if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+
+ switch (bytesperword) {
+ case 8:
+ sprintf(fmtstr, "%%16.16l%c ", fmtchar);
+ break;
+ case 4:
+ sprintf(fmtstr, "%%8.8l%c ", fmtchar);
+ break;
+ case 2:
+ sprintf(fmtstr, "%%4.4l%c ", fmtchar);
+ break;
+ case 1:
+ sprintf(fmtstr, "%%2.2l%c ", fmtchar);
+ break;
+ default:
+ return KDB_BADWIDTH;
+ }
+
+ last_repeat = repeat;
+ last_bytesperword = bytesperword;
+
+ if (strcmp(argv[0], "mds") == 0) {
+ symbolic = 1;
+ /* Do not save these changes as last_*, they are temporary mds
+ * overrides.
+ */
+ bytesperword = KDB_WORD_SIZE;
+ repeat = mdcount;
+ kdbgetintenv("NOSECT", &nosect);
+ }
+
+ /* Round address down modulo BYTESPERWORD */
+
+ addr &= ~(bytesperword-1);
+
+ while (repeat > 0) {
+ unsigned long a;
+ int n, z, num = (symbolic ? 1 : (16 / bytesperword));
+
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
+ if (phys) {
+ if (kdb_getphysword(&word, a, bytesperword)
+ || word)
+ break;
+ } else if (kdb_getword(&word, a, bytesperword) || word)
+ break;
+ }
+ n = min(num, repeat);
+ kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
+ num, repeat, phys);
+ addr += bytesperword * n;
+ repeat -= n;
+ z = (z + num - 1) / num;
+ if (z > 2) {
+ int s = num * (z-2);
+ kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
+ " zero suppressed\n",
+ addr, addr + bytesperword * s - 1);
+ addr += bytesperword * s;
+ repeat -= s;
+ }
+ }
+ last_addr = addr;
+
+ return 0;
+}
+
+/*
+ * kdb_mm - This function implements the 'mm' command.
+ * mm address-expression new-value
+ * Remarks:
+ * mm works on machine words, mmW works on bytes.
+ */
+static int kdb_mm(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset = 0;
+ unsigned long contents;
+ int nextarg;
+ int width;
+
+ if (argv[0][2] && !isdigit(argv[0][2]))
+ return KDB_NOTFOUND;
+
+ if (argc < 2)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg > argc)
+ return KDB_ARGCOUNT;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg != argc + 1)
+ return KDB_ARGCOUNT;
+
+ width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
+ diag = kdb_putword(addr, contents, width);
+ if (diag)
+ return diag;
+
+ kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
+
+ return 0;
+}
+
+/*
+ * kdb_go - This function implements the 'go' command.
+ * go [address-expression]
+ */
+static int kdb_go(int argc, const char **argv)
+{
+ unsigned long addr;
+ int diag;
+ int nextarg;
+ long offset;
+
+ if (argc == 1) {
+ if (raw_smp_processor_id() != kdb_initial_cpu) {
+ kdb_printf("go <address> must be issued from the "
+ "initial cpu, do cpu %d first\n",
+ kdb_initial_cpu);
+ return KDB_ARGCOUNT;
+ }
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg,
+ &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else if (argc) {
+ return KDB_ARGCOUNT;
+ }
+
+ diag = KDB_CMD_GO;
+ if (KDB_FLAG(CATASTROPHIC)) {
+ kdb_printf("Catastrophic error detected\n");
+ kdb_printf("kdb_continue_catastrophic=%d, ",
+ kdb_continue_catastrophic);
+ if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
+ kdb_printf("type go a second time if you really want "
+ "to continue\n");
+ return 0;
+ }
+ if (kdb_continue_catastrophic == 2) {
+ kdb_printf("forcing reboot\n");
+ kdb_reboot(0, NULL);
+ }
+ kdb_printf("attempting to continue\n");
+ }
+ return diag;
+}
+
+/*
+ * kdb_rd - This function implements the 'rd' command.
+ */
+static int kdb_rd(int argc, const char **argv)
+{
+ int len = kdb_check_regs();
+#if DBG_MAX_REG_NUM > 0
+ int i;
+ char *rname;
+ int rsize;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (len)
+ return len;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ rsize = dbg_reg_def[i].size * 2;
+ if (rsize > 16)
+ rsize = 2;
+ if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
+ len = 0;
+ kdb_printf("\n");
+ }
+ if (len)
+ len += kdb_printf(" ");
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ rname = dbg_get_reg(i, &reg8, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %02x", rname, reg8);
+ break;
+ case 16:
+ rname = dbg_get_reg(i, &reg16, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %04x", rname, reg16);
+ break;
+ case 32:
+ rname = dbg_get_reg(i, &reg32, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %08x", rname, reg32);
+ break;
+ case 64:
+ rname = dbg_get_reg(i, &reg64, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %016llx", rname, reg64);
+ break;
+ default:
+ len += kdb_printf("%s: ??", dbg_reg_def[i].name);
+ }
+ }
+ kdb_printf("\n");
+#else
+ if (len)
+ return len;
+
+ kdb_dumpregs(kdb_current_regs);
+#endif
+ return 0;
+}
+
+/*
+ * kdb_rm - This function implements the 'rm' (register modify) command.
+ * rm register-name new-contents
+ * Remarks:
+ * Allows register modification with the same restrictions as gdb
+ */
+static int kdb_rm(int argc, const char **argv)
+{
+#if DBG_MAX_REG_NUM > 0
+ int diag;
+ const char *rname;
+ int i;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ /*
+ * Allow presence or absence of leading '%' symbol.
+ */
+ rname = argv[1];
+ if (*rname == '%')
+ rname++;
+
+ diag = kdbgetu64arg(argv[2], &reg64);
+ if (diag)
+ return diag;
+
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+
+ diag = KDB_BADREG;
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ if (strcmp(rname, dbg_reg_def[i].name) == 0) {
+ diag = 0;
+ break;
+ }
+ }
+ if (!diag) {
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ reg8 = reg64;
+ dbg_set_reg(i, &reg8, kdb_current_regs);
+ break;
+ case 16:
+ reg16 = reg64;
+ dbg_set_reg(i, &reg16, kdb_current_regs);
+ break;
+ case 32:
+ reg32 = reg64;
+ dbg_set_reg(i, &reg32, kdb_current_regs);
+ break;
+ case 64:
+ dbg_set_reg(i, &reg64, kdb_current_regs);
+ break;
+ }
+ }
+ return diag;
+#else
+ kdb_printf("ERROR: Register set currently not implemented\n");
+ return 0;
+#endif
+}
+
+#if defined(CONFIG_MAGIC_SYSRQ)
+/*
+ * kdb_sr - This function implements the 'sr' (SYSRQ key) command
+ * which interfaces to the soi-disant MAGIC SYSRQ functionality.
+ * sr <magic-sysrq-code>
+ */
+static int kdb_sr(int argc, const char **argv)
+{
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ kdb_trap_printk++;
+ __handle_sysrq(*argv[1], NULL, 0);
+ kdb_trap_printk--;
+
+ return 0;
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+/*
+ * kdb_ef - This function implements the 'regs' (display exception
+ * frame) command. This command takes an address and expects to
+ * find an exception frame at that address, formats and prints
+ * it.
+ * regs address-expression
+ * Remarks:
+ * Not done yet.
+ */
+static int kdb_ef(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset;
+ int nextarg;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ show_regs((struct pt_regs *)addr);
+ return 0;
+}
+
+#if defined(CONFIG_MODULES)
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+static int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, kdb_modules, list) {
+
+ kdb_printf("%-20s%8u 0x%p ", mod->name,
+ mod->core_size, (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4d ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%p", mod->module_core);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
+
+#endif /* CONFIG_MODULES */
+
+/*
+ * kdb_env - This function implements the 'env' command. Display the
+ * current environment variables.
+ */
+
+static int kdb_env(int argc, const char **argv)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+
+ if (KDB_DEBUG(MASK))
+ kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
+
+ return 0;
+}
+
+#ifdef CONFIG_PRINTK
+/*
+ * kdb_dmesg - This function implements the 'dmesg' command to display
+ * the contents of the syslog buffer.
+ * dmesg [lines] [adjust]
+ */
+static int kdb_dmesg(int argc, const char **argv)
+{
+ char *syslog_data[4], *start, *end, c = '\0', *p;
+ int diag, logging, logsize, lines = 0, adjust = 0, n;
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+ if (argc) {
+ char *cp;
+ lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ lines = 0;
+ if (argc > 1) {
+ adjust = simple_strtoul(argv[2], &cp, 0);
+ if (*cp || adjust < 0)
+ adjust = 0;
+ }
+ }
+
+ /* disable LOGGING if set */
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (!diag && logging) {
+ const char *setargs[] = { "set", "LOGGING", "0" };
+ kdb_set(2, setargs);
+ }
+
+ /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
+ * logical start, end+1. */
+ kdb_syslog_data(syslog_data);
+ if (syslog_data[2] == syslog_data[3])
+ return 0;
+ logsize = syslog_data[1] - syslog_data[0];
+ start = syslog_data[2];
+ end = syslog_data[3];
+#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
+ for (n = 0, p = start; p < end; ++p) {
+ c = *KDB_WRAP(p);
+ if (c == '\n')
+ ++n;
+ }
+ if (c != '\n')
+ ++n;
+ if (lines < 0) {
+ if (adjust >= n)
+ kdb_printf("buffer only contains %d lines, nothing "
+ "printed\n", n);
+ else if (adjust - lines >= n)
+ kdb_printf("buffer only contains %d lines, last %d "
+ "lines printed\n", n, n - adjust);
+ if (adjust) {
+ for (; start < end && adjust; ++start) {
+ if (*KDB_WRAP(start) == '\n')
+ --adjust;
+ }
+ if (start < end)
+ ++start;
+ }
+ for (p = start; p < end && lines; ++p) {
+ if (*KDB_WRAP(p) == '\n')
+ ++lines;
+ }
+ end = p;
+ } else if (lines > 0) {
+ int skip = n - (adjust + lines);
+ if (adjust >= n) {
+ kdb_printf("buffer only contains %d lines, "
+ "nothing printed\n", n);
+ skip = n;
+ } else if (skip < 0) {
+ lines += skip;
+ skip = 0;
+ kdb_printf("buffer only contains %d lines, first "
+ "%d lines printed\n", n, lines);
+ }
+ for (; start < end && skip; ++start) {
+ if (*KDB_WRAP(start) == '\n')
+ --skip;
+ }
+ for (p = start; p < end && lines; ++p) {
+ if (*KDB_WRAP(p) == '\n')
+ --lines;
+ }
+ end = p;
+ }
+ /* Do a line at a time (max 200 chars) to reduce protocol overhead */
+ c = '\n';
+ while (start != end) {
+ char buf[201];
+ p = buf;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ while (start < end && (c = *KDB_WRAP(start)) &&
+ (p - buf) < sizeof(buf)-1) {
+ ++start;
+ *p++ = c;
+ if (c == '\n')
+ break;
+ }
+ *p = '\0';
+ kdb_printf("%s", buf);
+ }
+ if (c != '\n')
+ kdb_printf("\n");
+
+ return 0;
+}
+#endif /* CONFIG_PRINTK */
+/*
+ * kdb_cpu - This function implements the 'cpu' command.
+ * cpu [<cpunum>]
+ * Returns:
+ * KDB_CMD_CPU for success, a kdb diagnostic if error
+ */
+static void kdb_cpu_status(void)
+{
+ int i, start_cpu, first_print = 1;
+ char state, prev_state = '?';
+
+ kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
+ kdb_printf("Available cpus: ");
+ for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i)) {
+ state = 'F'; /* cpu is offline */
+ } else {
+ state = ' '; /* cpu is responding to kdb */
+ if (kdb_task_state_char(KDB_TSK(i)) == 'I')
+ state = 'I'; /* idle task */
+ }
+ if (state != prev_state) {
+ if (prev_state != '?') {
+ if (!first_print)
+ kdb_printf(", ");
+ first_print = 0;
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ prev_state = state;
+ start_cpu = i;
+ }
+ }
+ /* print the trailing cpus, ignoring them if they are all offline */
+ if (prev_state != 'F') {
+ if (!first_print)
+ kdb_printf(", ");
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ kdb_printf("\n");
+}
+
+static int kdb_cpu(int argc, const char **argv)
+{
+ unsigned long cpunum;
+ int diag;
+
+ if (argc == 0) {
+ kdb_cpu_status();
+ return 0;
+ }
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetularg(argv[1], &cpunum);
+ if (diag)
+ return diag;
+
+ /*
+ * Validate cpunum
+ */
+ if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ return KDB_BADCPUNUM;
+
+ dbg_switch_cpu = cpunum;
+
+ /*
+ * Switch to other cpu
+ */
+ return KDB_CMD_CPU;
+}
+
+/* The user may not realize that ps/bta with no parameters does not print idle
+ * or sleeping system daemon processes, so tell them how many were suppressed.
+ */
+void kdb_ps_suppressed(void)
+{
+ int idle = 0, daemon = 0;
+ unsigned long mask_I = kdb_task_state_string("I"),
+ mask_M = kdb_task_state_string("M");
+ unsigned long cpu;
+ const struct task_struct *p, *g;
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask_I))
+ ++idle;
+ }
+ kdb_do_each_thread(g, p) {
+ if (kdb_task_state(p, mask_M))
+ ++daemon;
+ } kdb_while_each_thread(g, p);
+ if (idle || daemon) {
+ if (idle)
+ kdb_printf("%d idle process%s (state I)%s\n",
+ idle, idle == 1 ? "" : "es",
+ daemon ? " and " : "");
+ if (daemon)
+ kdb_printf("%d sleeping system daemon (state M) "
+ "process%s", daemon,
+ daemon == 1 ? "" : "es");
+ kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
+ }
+}
+
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ * ps [DRSTCZEUIMA] All processes, optionally filtered by state
+ */
+void kdb_ps1(const struct task_struct *p)
+{
+ int cpu;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return;
+
+ cpu = kdb_process_cpu(p);
+ kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ (void *)p, p->pid, p->parent->pid,
+ kdb_task_has_cpu(p), kdb_process_cpu(p),
+ kdb_task_state_char(p),
+ (void *)(&p->thread),
+ p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
+ p->comm);
+ if (kdb_task_has_cpu(p)) {
+ if (!KDB_TSK(cpu)) {
+ kdb_printf(" Error: no saved data for this cpu\n");
+ } else {
+ if (KDB_TSK(cpu) != p)
+ kdb_printf(" Error: does not match running "
+ "process table (0x%p)\n", KDB_TSK(cpu));
+ }
+ }
+}
+
+static int kdb_ps(int argc, const char **argv)
+{
+ struct task_struct *g, *p;
+ unsigned long mask, cpu;
+
+ if (argc == 0)
+ kdb_ps_suppressed();
+ kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
+ (int)(2*sizeof(void *))+2, "Task Addr",
+ (int)(2*sizeof(void *))+2, "Thread");
+ mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ }
+ kdb_printf("\n");
+ /* Now the real tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ } kdb_while_each_thread(g, p);
+
+ return 0;
+}
+
+/*
+ * kdb_pid - This function implements the 'pid' command which switches
+ * the currently active process.
+ * pid [<pid> | R]
+ */
+static int kdb_pid(int argc, const char **argv)
+{
+ struct task_struct *p;
+ unsigned long val;
+ int diag;
+
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ if (strcmp(argv[1], "R") == 0) {
+ p = KDB_TSK(kdb_initial_cpu);
+ } else {
+ diag = kdbgetularg(argv[1], &val);
+ if (diag)
+ return KDB_BADINT;
+
+ p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
+ if (!p) {
+ kdb_printf("No task with pid=%d\n", (pid_t)val);
+ return 0;
+ }
+ }
+ kdb_set_current_task(p);
+ }
+ kdb_printf("KDB current process is %s(pid=%d)\n",
+ kdb_current_task->comm,
+ kdb_current_task->pid);
+
+ return 0;
+}
+
+/*
+ * kdb_ll - This function implements the 'll' command which follows a
+ * linked list and executes an arbitrary command for each
+ * element.
+ */
+static int kdb_ll(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset = 0;
+ unsigned long va;
+ unsigned long linkoffset;
+ int nextarg;
+ const char *command;
+
+ if (argc != 3)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+
+ diag = kdbgetularg(argv[2], &linkoffset);
+ if (diag)
+ return diag;
+
+ /*
+ * Using the starting address as
+ * the first element in the list, and assuming that
+ * the list ends with a null pointer.
+ */
+
+ va = addr;
+ command = kdb_strdup(argv[3], GFP_KDB);
+ if (!command) {
+ kdb_printf("%s: cannot duplicate command\n", __func__);
+ return 0;
+ }
+ /* Recursive use of kdb_parse, do not use argv after this point */
+ argv = NULL;
+
+ while (va) {
+ char buf[80];
+
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
+ sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
+ diag = kdb_parse(buf);
+ if (diag)
+ return diag;
+
+ addr = va + linkoffset;
+ if (kdb_getword(&va, addr, sizeof(va)))
+ return 0;
+ }
+ kfree(command);
+
+ return 0;
+}
+
+static int kdb_kgdb(int argc, const char **argv)
+{
+ return KDB_CMD_KGDB;
+}
+
+/*
+ * kdb_help - This function implements the 'help' and '?' commands.
+ */
+static int kdb_help(int argc, const char **argv)
+{
+ kdbtab_t *kt;
+ int i;
+
+ kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
+ kdb_printf("-----------------------------"
+ "-----------------------------\n");
+ for_each_kdbcmd(kt, i) {
+ if (kt->cmd_name)
+ kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
+ kt->cmd_usage, kt->cmd_help);
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ }
+ return 0;
+}
+
+/*
+ * kdb_kill - This function implements the 'kill' commands.
+ */
+static int kdb_kill(int argc, const char **argv)
+{
+ long sig, pid;
+ char *endp;
+ struct task_struct *p;
+ struct siginfo info;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ sig = simple_strtol(argv[1], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (sig >= 0) {
+ kdb_printf("Invalid signal parameter.<-signal>\n");
+ return 0;
+ }
+ sig = -sig;
+
+ pid = simple_strtol(argv[2], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (pid <= 0) {
+ kdb_printf("Process ID must be large than 0.\n");
+ return 0;
+ }
+
+ /* Find the process. */
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (!p) {
+ kdb_printf("The specified process isn't found.\n");
+ return 0;
+ }
+ p = p->group_leader;
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = pid; /* same capabilities as process being signalled */
+ info.si_uid = 0; /* kdb has root authority */
+ kdb_send_sig_info(p, &info);
+ return 0;
+}
+
+struct kdb_tm {
+ int tm_sec; /* seconds */
+ int tm_min; /* minutes */
+ int tm_hour; /* hours */
+ int tm_mday; /* day of the month */
+ int tm_mon; /* month */
+ int tm_year; /* year */
+};
+
+static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
+{
+ /* This will work from 1970-2099, 2100 is not a leap year */
+ static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
+ 31, 30, 31, 30, 31 };
+ memset(tm, 0, sizeof(*tm));
+ tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
+ tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
+ (2 * 365 + 1); /* shift base from 1970 to 1968 */
+ tm->tm_min = tm->tm_sec / 60 % 60;
+ tm->tm_hour = tm->tm_sec / 60 / 60;
+ tm->tm_sec = tm->tm_sec % 60;
+ tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
+ tm->tm_mday %= (4*365+1);
+ mon_day[1] = 29;
+ while (tm->tm_mday >= mon_day[tm->tm_mon]) {
+ tm->tm_mday -= mon_day[tm->tm_mon];
+ if (++tm->tm_mon == 12) {
+ tm->tm_mon = 0;
+ ++tm->tm_year;
+ mon_day[1] = 28;
+ }
+ }
+ ++tm->tm_mday;
+}
+
+/*
+ * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
+ * I cannot call that code directly from kdb, it has an unconditional
+ * cli()/sti() and calls routines that take locks which can stop the debugger.
+ */
+static void kdb_sysinfo(struct sysinfo *val)
+{
+ struct timespec uptime;
+ do_posix_clock_monotonic_gettime(&uptime);
+ memset(val, 0, sizeof(*val));
+ val->uptime = uptime.tv_sec;
+ val->loads[0] = avenrun[0];
+ val->loads[1] = avenrun[1];
+ val->loads[2] = avenrun[2];
+ val->procs = nr_threads-1;
+ si_meminfo(val);
+
+ return;
+}
+
+/*
+ * kdb_summary - This function implements the 'summary' command.
+ */
+static int kdb_summary(int argc, const char **argv)
+{
+ struct timespec now;
+ struct kdb_tm tm;
+ struct sysinfo val;
+
+ if (argc)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
+ kdb_printf("release %s\n", init_uts_ns.name.release);
+ kdb_printf("version %s\n", init_uts_ns.name.version);
+ kdb_printf("machine %s\n", init_uts_ns.name.machine);
+ kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
+ kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
+ kdb_printf("ccversion %s\n", __stringify(CCVERSION));
+
+ now = __current_kernel_time();
+ kdb_gmtime(&now, &tm);
+ kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
+ "tz_minuteswest %d\n",
+ 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec,
+ sys_tz.tz_minuteswest);
+
+ kdb_sysinfo(&val);
+ kdb_printf("uptime ");
+ if (val.uptime > (24*60*60)) {
+ int days = val.uptime / (24*60*60);
+ val.uptime %= (24*60*60);
+ kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
+ }
+ kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
+
+ /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+ kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
+ LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
+ LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
+ LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
+#undef LOAD_INT
+#undef LOAD_FRAC
+ /* Display in kilobytes */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
+ "Buffers: %8lu kB\n",
+ val.totalram, val.freeram, val.bufferram);
+ return 0;
+}
+
+/*
+ * kdb_per_cpu - This function implements the 'per_cpu' command.
+ */
+static int kdb_per_cpu(int argc, const char **argv)
+{
+ char buf[256], fmtstr[64];
+ kdb_symtab_t symtab;
+ cpumask_t suppress = CPU_MASK_NONE;
+ int cpu, diag;
+ unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
+
+ if (argc < 1 || argc > 3)
+ return KDB_ARGCOUNT;
+
+ snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
+ if (!kdbgetsymval(buf, &symtab)) {
+ kdb_printf("%s is not a per_cpu variable\n", argv[1]);
+ return KDB_BADADDR;
+ }
+ if (argc >= 2) {
+ diag = kdbgetularg(argv[2], &bytesperword);
+ if (diag)
+ return diag;
+ }
+ if (!bytesperword)
+ bytesperword = KDB_WORD_SIZE;
+ else if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+ sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
+ if (argc >= 3) {
+ diag = kdbgetularg(argv[3], &whichcpu);
+ if (diag)
+ return diag;
+ if (!cpu_online(whichcpu)) {
+ kdb_printf("cpu %ld is not online\n", whichcpu);
+ return KDB_BADCPUNUM;
+ }
+ }
+
+ /* Most architectures use __per_cpu_offset[cpu], some use
+ * __per_cpu_offset(cpu), smp has no __per_cpu_offset.
+ */
+#ifdef __per_cpu_offset
+#define KDB_PCU(cpu) __per_cpu_offset(cpu)
+#else
+#ifdef CONFIG_SMP
+#define KDB_PCU(cpu) __per_cpu_offset[cpu]
+#else
+#define KDB_PCU(cpu) 0
+#endif
+#endif
+
+ for_each_online_cpu(cpu) {
+ if (whichcpu != ~0UL && whichcpu != cpu)
+ continue;
+ addr = symtab.sym_start + KDB_PCU(cpu);
+ diag = kdb_getword(&val, addr, bytesperword);
+ if (diag) {
+ kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
+ "read, diag=%d\n", cpu, addr, diag);
+ continue;
+ }
+#ifdef CONFIG_SMP
+ if (!val) {
+ cpu_set(cpu, suppress);
+ continue;
+ }
+#endif /* CONFIG_SMP */
+ kdb_printf("%5d ", cpu);
+ kdb_md_line(fmtstr, addr,
+ bytesperword == KDB_WORD_SIZE,
+ 1, bytesperword, 1, 1, 0);
+ }
+ if (cpus_weight(suppress) == 0)
+ return 0;
+ kdb_printf("Zero suppressed cpu(s):");
+ for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
+ cpu = next_cpu(cpu, suppress)) {
+ kdb_printf(" %d", cpu);
+ if (cpu == num_possible_cpus() - 1 ||
+ next_cpu(cpu, suppress) != cpu + 1)
+ continue;
+ while (cpu < num_possible_cpus() &&
+ next_cpu(cpu, suppress) == cpu + 1)
+ ++cpu;
+ kdb_printf("-%d", cpu);
+ }
+ kdb_printf("\n");
+
+#undef KDB_PCU
+
+ return 0;
+}
+
+/*
+ * display help for the use of cmd | grep pattern
+ */
+static int kdb_grep_help(int argc, const char **argv)
+{
+ kdb_printf("Usage of cmd args | grep pattern:\n");
+ kdb_printf(" Any command's output may be filtered through an ");
+ kdb_printf("emulated 'pipe'.\n");
+ kdb_printf(" 'grep' is just a key word.\n");
+ kdb_printf(" The pattern may include a very limited set of "
+ "metacharacters:\n");
+ kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
+ kdb_printf(" And if there are spaces in the pattern, you may "
+ "quote it:\n");
+ kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
+ " or \"^pat tern$\"\n");
+ return 0;
+}
+
+/*
+ * kdb_register_repeat - This function is used to register a kernel
+ * debugger command.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * repeat Does the command auto repeat on enter?
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+#define kdb_command_extend 50 /* arbitrary */
+int kdb_register_repeat(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen,
+ kdb_repeat_t repeat)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * Brute force method to determine duplicates
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kdb_printf("Duplicate kdb command registered: "
+ "%s, func %p help %s\n", cmd, func, help);
+ return 1;
+ }
+ }
+
+ /*
+ * Insert command into first available location in table
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name == NULL)
+ break;
+ }
+
+ if (i >= kdb_max_commands) {
+ kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
+ kdb_command_extend) * sizeof(*new), GFP_KDB);
+ if (!new) {
+ kdb_printf("Could not allocate new kdb_command "
+ "table\n");
+ return 1;
+ }
+ if (kdb_commands) {
+ memcpy(new, kdb_commands,
+ kdb_max_commands * sizeof(*new));
+ kfree(kdb_commands);
+ }
+ memset(new + kdb_max_commands, 0,
+ kdb_command_extend * sizeof(*new));
+ kdb_commands = new;
+ kp = kdb_commands + kdb_max_commands;
+ kdb_max_commands += kdb_command_extend;
+ }
+
+ kp->cmd_name = cmd;
+ kp->cmd_func = func;
+ kp->cmd_usage = usage;
+ kp->cmd_help = help;
+ kp->cmd_flags = 0;
+ kp->cmd_minlen = minlen;
+ kp->cmd_repeat = repeat;
+
+ return 0;
+}
+
+/*
+ * kdb_register - Compatibility register function for commands that do
+ * not need to specify a repeat state. Equivalent to
+ * kdb_register_repeat with KDB_REPEAT_NONE.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+int kdb_register(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen)
+{
+ return kdb_register_repeat(cmd, func, usage, help, minlen,
+ KDB_REPEAT_NONE);
+}
+
+/*
+ * kdb_unregister - This function is used to unregister a kernel
+ * debugger command. It is generally called when a module which
+ * implements kdb commands is unloaded.
+ * Inputs:
+ * cmd Command name
+ * Returns:
+ * zero for success, one command not registered.
+ */
+int kdb_unregister(char *cmd)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * find the command.
+ */
+ for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kp->cmd_name = NULL;
+ return 0;
+ }
+ }
+
+ /* Couldn't find it. */
+ return 1;
+}
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ int i;
+ kdbtab_t *kp;
+
+ for_each_kdbcmd(kp, i)
+ kp->cmd_name = NULL;
+
+ kdb_register_repeat("md", kdb_md, "<vaddr>",
+ "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
+ KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
+ "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
+ "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mds", kdb_md, "<vaddr>",
+ "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
+ "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("go", kdb_go, "[<vaddr>]",
+ "Continue Execution", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("rd", kdb_rd, "",
+ "Display Registers", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
+ "Modify Registers", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("ef", kdb_ef, "<vaddr>",
+ "Display exception frame", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
+ "Stack traceback", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("btp", kdb_bt, "<pid>",
+ "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
+ "Display stack all processes", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("btc", kdb_bt, "",
+ "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("btt", kdb_bt, "<vaddr>",
+ "Backtrace process given its struct task address", 0,
+ KDB_REPEAT_NONE);
+ kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
+ "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("env", kdb_env, "",
+ "Show environment variables", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("set", kdb_set, "",
+ "Set environment variables", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("help", kdb_help, "",
+ "Display Help Message", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("?", kdb_help, "",
+ "Display Help Message", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
+ "Switch to new cpu", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("kgdb", kdb_kgdb, "",
+ "Enter kgdb mode", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
+ "Display active task list", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("pid", kdb_pid, "<pidnum>",
+ "Switch to another task", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("reboot", kdb_reboot, "",
+ "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
+#if defined(CONFIG_MODULES)
+ kdb_register_repeat("lsmod", kdb_lsmod, "",
+ "List loaded kernel modules", 0, KDB_REPEAT_NONE);
+#endif
+#if defined(CONFIG_MAGIC_SYSRQ)
+ kdb_register_repeat("sr", kdb_sr, "<key>",
+ "Magic SysRq key", 0, KDB_REPEAT_NONE);
+#endif
+#if defined(CONFIG_PRINTK)
+ kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
+ "Display syslog buffer", 0, KDB_REPEAT_NONE);
+#endif
+ kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
+ "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
+ "Send a signal to a process", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("summary", kdb_summary, "",
+ "Summarize the system", 4, KDB_REPEAT_NONE);
+ kdb_register_repeat("per_cpu", kdb_per_cpu, "",
+ "Display per_cpu variables", 3, KDB_REPEAT_NONE);
+ kdb_register_repeat("grephelp", kdb_grep_help, "",
+ "Display help on | grep", 0, KDB_REPEAT_NONE);
+}
+
+/* Execute any commands defined in kdb_cmds. */
+static void __init kdb_cmd_init(void)
+{
+ int i, diag;
+ for (i = 0; kdb_cmds[i]; ++i) {
+ diag = kdb_parse(kdb_cmds[i]);
+ if (diag)
+ kdb_printf("kdb command %s failed, kdb diag %d\n",
+ kdb_cmds[i], diag);
+ }
+ if (defcmd_in_progress) {
+ kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
+ kdb_parse("endefcmd");
+ }
+}
+
+/* Intialize kdb_printf, breakpoint tables and kdb state */
+void __init kdb_init(int lvl)
+{
+ static int kdb_init_lvl = KDB_NOT_INITIALIZED;
+ int i;
+
+ if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
+ return;
+ for (i = kdb_init_lvl; i < lvl; i++) {
+ switch (i) {
+ case KDB_NOT_INITIALIZED:
+ kdb_inittab(); /* Initialize Command Table */
+ kdb_initbptab(); /* Initialize Breakpoints */
+ break;
+ case KDB_INIT_EARLY:
+ kdb_cmd_init(); /* Build kdb_cmds tables */
+ break;
+ }
+ }
+ kdb_init_lvl = lvl;
+}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..c438f545a321
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,298 @@
+#ifndef _KDBPRIVATE_H
+#define _KDBPRIVATE_H
+
+/*
+ * Kernel Debugger Architecture Independent Private Headers
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kgdb.h>
+#include "../debug_core.h"
+
+/* Kernel Debugger Error codes. Must not overlap with command codes. */
+#define KDB_NOTFOUND (-1)
+#define KDB_ARGCOUNT (-2)
+#define KDB_BADWIDTH (-3)
+#define KDB_BADRADIX (-4)
+#define KDB_NOTENV (-5)
+#define KDB_NOENVVALUE (-6)
+#define KDB_NOTIMP (-7)
+#define KDB_ENVFULL (-8)
+#define KDB_ENVBUFFULL (-9)
+#define KDB_TOOMANYBPT (-10)
+#define KDB_TOOMANYDBREGS (-11)
+#define KDB_DUPBPT (-12)
+#define KDB_BPTNOTFOUND (-13)
+#define KDB_BADMODE (-14)
+#define KDB_BADINT (-15)
+#define KDB_INVADDRFMT (-16)
+#define KDB_BADREG (-17)
+#define KDB_BADCPUNUM (-18)
+#define KDB_BADLENGTH (-19)
+#define KDB_NOBP (-20)
+#define KDB_BADADDR (-21)
+
+/* Kernel Debugger Command codes. Must not overlap with error codes. */
+#define KDB_CMD_GO (-1001)
+#define KDB_CMD_CPU (-1002)
+#define KDB_CMD_SS (-1003)
+#define KDB_CMD_SSB (-1004)
+#define KDB_CMD_KGDB (-1005)
+#define KDB_CMD_KGDB2 (-1006)
+
+/* Internal debug flags */
+#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
+#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
+#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
+#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
+#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
+#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
+#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
+#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
+
+#define KDB_DEBUG(flag) (kdb_flags & \
+ (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
+#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
+ kdb_print_state(text, value)
+
+#if BITS_PER_LONG == 32
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%08lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%08lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%08x"
+#define kdb_f_count_fmt "%d"
+
+#elif BITS_PER_LONG == 64
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%016lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%016lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%016x"
+#define kdb_f_count_fmt "%ld"
+
+#endif
+
+/*
+ * KDB_MAXBPT describes the total number of breakpoints
+ * supported by this architecure.
+ */
+#define KDB_MAXBPT 16
+
+/* Maximum number of arguments to a function */
+#define KDB_MAXARGS 16
+
+typedef enum {
+ KDB_REPEAT_NONE = 0, /* Do not repeat this command */
+ KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
+ KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
+} kdb_repeat_t;
+
+typedef int (*kdb_func_t)(int, const char **);
+
+/* Symbol table format returned by kallsyms. */
+typedef struct __ksymtab {
+ unsigned long value; /* Address of symbol */
+ const char *mod_name; /* Module containing symbol or
+ * "kernel" */
+ unsigned long mod_start;
+ unsigned long mod_end;
+ const char *sec_name; /* Section containing symbol */
+ unsigned long sec_start;
+ unsigned long sec_end;
+ const char *sym_name; /* Full symbol name, including
+ * any version */
+ unsigned long sym_start;
+ unsigned long sym_end;
+ } kdb_symtab_t;
+extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
+
+/* Exported Symbols for kernel loadable modules to use. */
+extern int kdb_register(char *, kdb_func_t, char *, char *, short);
+extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
+ short, kdb_repeat_t);
+extern int kdb_unregister(char *);
+
+extern int kdb_getarea_size(void *, unsigned long, size_t);
+extern int kdb_putarea_size(unsigned long, void *, size_t);
+
+/*
+ * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
+ * names, not pointers. The underlying *_size functions take pointers.
+ */
+#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
+#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
+
+extern int kdb_getphysword(unsigned long *word,
+ unsigned long addr, size_t size);
+extern int kdb_getword(unsigned long *, unsigned long, size_t);
+extern int kdb_putword(unsigned long, unsigned long, size_t);
+
+extern int kdbgetularg(const char *, unsigned long *);
+extern char *kdbgetenv(const char *);
+extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
+ long *, char **);
+extern int kdbgetsymval(const char *, kdb_symtab_t *);
+extern int kdbnearsym(unsigned long, kdb_symtab_t *);
+extern void kdbnearsym_cleanup(void);
+extern char *kdb_strdup(const char *str, gfp_t type);
+extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
+
+/* Routine for debugging the debugger state. */
+extern void kdb_print_state(const char *, int);
+
+extern int kdb_state;
+#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
+#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
+#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
+#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
+ * kdb control */
+#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
+#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
+#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
+ * DOING_SS is also set */
+#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
+ * after one ss, independent of
+ * DOING_SS */
+#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
+#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
+#define KDB_STATE_PAGER 0x00000400 /* pager is available */
+#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
+ * back to initial cpu */
+#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
+#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
+#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
+#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
+ * adjusted */
+#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
+#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
+ * keyboard on this cpu */
+#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
+#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
+#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
+#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
+#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
+ * specific use */
+
+#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
+#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
+#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
+
+extern int kdb_nextline; /* Current number of lines displayed */
+
+typedef struct _kdb_bp {
+ unsigned long bp_addr; /* Address breakpoint is present at */
+ unsigned int bp_free:1; /* This entry is available */
+ unsigned int bp_enabled:1; /* Breakpoint is active in register */
+ unsigned int bp_type:4; /* Uses hardware register */
+ unsigned int bp_installed:1; /* Breakpoint is installed */
+ unsigned int bp_delay:1; /* Do delayed bp handling */
+ unsigned int bp_delayed:1; /* Delayed breakpoint */
+ unsigned int bph_length; /* HW break length */
+} kdb_bp_t;
+
+#ifdef CONFIG_KGDB_KDB
+extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
+
+/* The KDB shell command table */
+typedef struct _kdbtab {
+ char *cmd_name; /* Command name */
+ kdb_func_t cmd_func; /* Function to execute command */
+ char *cmd_usage; /* Usage String for this command */
+ char *cmd_help; /* Help message for this command */
+ short cmd_flags; /* Parsing flags */
+ short cmd_minlen; /* Minimum legal # command
+ * chars required */
+ kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
+} kdbtab_t;
+
+extern int kdb_bt(int, const char **); /* KDB display back trace */
+
+/* KDB breakpoint management functions */
+extern void kdb_initbptab(void);
+extern void kdb_bp_install(struct pt_regs *);
+extern void kdb_bp_remove(void);
+
+typedef enum {
+ KDB_DB_BPT, /* Breakpoint */
+ KDB_DB_SS, /* Single-step trap */
+ KDB_DB_SSB, /* Single step to branch */
+ KDB_DB_SSBPT, /* Single step over breakpoint */
+ KDB_DB_NOBPT /* Spurious breakpoint */
+} kdb_dbtrap_t;
+
+extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
+ int, kdb_dbtrap_t, struct pt_regs *);
+
+/* Miscellaneous functions and data areas */
+extern int kdb_grepping_flag;
+extern char kdb_grep_string[];
+extern int kdb_grep_leading;
+extern int kdb_grep_trailing;
+extern char *kdb_cmds[];
+extern void kdb_syslog_data(char *syslog_data[]);
+extern unsigned long kdb_task_state_string(const char *);
+extern char kdb_task_state_char (const struct task_struct *);
+extern unsigned long kdb_task_state(const struct task_struct *p,
+ unsigned long mask);
+extern void kdb_ps_suppressed(void);
+extern void kdb_ps1(const struct task_struct *p);
+extern void kdb_print_nameval(const char *name, unsigned long val);
+extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
+extern void kdb_meminfo_proc_show(void);
+extern const char *kdb_walk_kallsyms(loff_t *pos);
+extern char *kdb_getstr(char *, size_t, char *);
+
+/* Defines for kdb_symbol_print */
+#define KDB_SP_SPACEB 0x0001 /* Space before string */
+#define KDB_SP_SPACEA 0x0002 /* Space after string */
+#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
+#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
+#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
+#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
+#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
+
+#define KDB_TSK(cpu) kgdb_info[cpu].task
+#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
+
+extern struct task_struct *kdb_curr_task(int);
+
+#define kdb_task_has_cpu(p) (task_curr(p))
+
+/* Simplify coexistence with NPTL */
+#define kdb_do_each_thread(g, p) do_each_thread(g, p)
+#define kdb_while_each_thread(g, p) while_each_thread(g, p)
+
+#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
+
+extern void *debug_kmalloc(size_t size, gfp_t flags);
+extern void debug_kfree(void *);
+extern void debug_kusage(void);
+
+extern void kdb_set_current_task(struct task_struct *);
+extern struct task_struct *kdb_current_task;
+#ifdef CONFIG_MODULES
+extern struct list_head *kdb_modules;
+#endif /* CONFIG_MODULES */
+
+extern char kdb_prompt_str[];
+
+#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
+
+#endif /* CONFIG_KGDB_KDB */
+#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000000..45344d5c53dd
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
+/*
+ * Kernel Debugger Architecture Independent Support Functions
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
+ */
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kallsyms.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/ptrace.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/kdb.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+/*
+ * kdbgetsymval - Return the address of the given symbol.
+ *
+ * Parameters:
+ * symname Character string containing symbol name
+ * symtab Structure to receive results
+ * Returns:
+ * 0 Symbol not found, symtab zero filled
+ * 1 Symbol mapped to module/symbol/section, data in symtab
+ */
+int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
+{
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
+ symtab);
+ memset(symtab, 0, sizeof(*symtab));
+ symtab->sym_start = kallsyms_lookup_name(symname);
+ if (symtab->sym_start) {
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 1, "
+ "symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
+ return 1;
+ }
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 0\n");
+ return 0;
+}
+EXPORT_SYMBOL(kdbgetsymval);
+
+static char *kdb_name_table[100]; /* arbitrary size */
+
+/*
+ * kdbnearsym - Return the name of the symbol with the nearest address
+ * less than 'addr'.
+ *
+ * Parameters:
+ * addr Address to check for symbol near
+ * symtab Structure to receive results
+ * Returns:
+ * 0 No sections contain this address, symtab zero filled
+ * 1 Address mapped to module/symbol/section, data in symtab
+ * Remarks:
+ * 2.6 kallsyms has a "feature" where it unpacks the name into a
+ * string. If that string is reused before the caller expects it
+ * then the caller sees its string change without warning. To
+ * avoid cluttering up the main kdb code with lots of kdb_strdup,
+ * tests and kfree calls, kdbnearsym maintains an LRU list of the
+ * last few unique strings. The list is sized large enough to
+ * hold active strings, no kdb caller of kdbnearsym makes more
+ * than ~20 later calls before using a saved value.
+ */
+int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
+{
+ int ret = 0;
+ unsigned long symbolsize;
+ unsigned long offset;
+#define knt1_size 128 /* must be >= kallsyms table size */
+ char *knt1 = NULL;
+
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
+ memset(symtab, 0, sizeof(*symtab));
+
+ if (addr < 4096)
+ goto out;
+ knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
+ if (!knt1) {
+ kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
+ addr);
+ goto out;
+ }
+ symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
+ (char **)(&symtab->mod_name), knt1);
+ if (offset > 8*1024*1024) {
+ symtab->sym_name = NULL;
+ addr = offset = symbolsize = 0;
+ }
+ symtab->sym_start = addr - offset;
+ symtab->sym_end = symtab->sym_start + symbolsize;
+ ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
+
+ if (ret) {
+ int i;
+ /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
+ * set but the buffer passed into kallsyms_lookup is not used,
+ * so it contains garbage. The caller has to work out which
+ * buffer needs to be saved.
+ *
+ * What was Rusty smoking when he wrote that code?
+ */
+ if (symtab->sym_name != knt1) {
+ strncpy(knt1, symtab->sym_name, knt1_size);
+ knt1[knt1_size-1] = '\0';
+ }
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i] &&
+ strcmp(kdb_name_table[i], knt1) == 0)
+ break;
+ }
+ if (i >= ARRAY_SIZE(kdb_name_table)) {
+ debug_kfree(kdb_name_table[0]);
+ memcpy(kdb_name_table, kdb_name_table+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-1));
+ } else {
+ debug_kfree(knt1);
+ knt1 = kdb_name_table[i];
+ memcpy(kdb_name_table+i, kdb_name_table+i+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-i-1));
+ }
+ i = ARRAY_SIZE(kdb_name_table) - 1;
+ kdb_name_table[i] = knt1;
+ symtab->sym_name = kdb_name_table[i];
+ knt1 = NULL;
+ }
+
+ if (symtab->mod_name == NULL)
+ symtab->mod_name = "kernel";
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
+ "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
+ symtab->sym_start, symtab->mod_name, symtab->sym_name,
+ symtab->sym_name);
+
+out:
+ debug_kfree(knt1);
+ return ret;
+}
+
+void kdbnearsym_cleanup(void)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i]) {
+ debug_kfree(kdb_name_table[i]);
+ kdb_name_table[i] = NULL;
+ }
+ }
+}
+
+static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
+
+/*
+ * kallsyms_symbol_complete
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * max_len maximum length that can be returned
+ * Returns:
+ * Number of symbols which match the given prefix.
+ * Notes:
+ * prefix_name is changed to contain the longest unique prefix that
+ * starts with this prefix (tab completion).
+ */
+int kallsyms_symbol_complete(char *prefix_name, int max_len)
+{
+ loff_t pos = 0;
+ int prefix_len = strlen(prefix_name), prev_len = 0;
+ int i, number = 0;
+ const char *name;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strcpy(ks_namebuf, name);
+ /* Work out the longest name that matches the prefix */
+ if (++number == 1) {
+ prev_len = min_t(int, max_len-1,
+ strlen(ks_namebuf));
+ memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
+ ks_namebuf_prev[prev_len] = '\0';
+ continue;
+ }
+ for (i = 0; i < prev_len; i++) {
+ if (ks_namebuf[i] != ks_namebuf_prev[i]) {
+ prev_len = i;
+ ks_namebuf_prev[i] = '\0';
+ break;
+ }
+ }
+ }
+ }
+ if (prev_len > prefix_len)
+ memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
+ return number;
+}
+
+/*
+ * kallsyms_symbol_next
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * flag 0 means search from the head, 1 means continue search.
+ * Returns:
+ * 1 if a symbol matches the given prefix.
+ * 0 if no string found
+ */
+int kallsyms_symbol_next(char *prefix_name, int flag)
+{
+ int prefix_len = strlen(prefix_name);
+ static loff_t pos;
+ const char *name;
+
+ if (!flag)
+ pos = 0;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strncpy(prefix_name, name, strlen(name)+1);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * kdb_symbol_print - Standard method for printing a symbol name and offset.
+ * Inputs:
+ * addr Address to be printed.
+ * symtab Address of symbol data, if NULL this routine does its
+ * own lookup.
+ * punc Punctuation for string, bit field.
+ * Remarks:
+ * The string and its punctuation is only printed if the address
+ * is inside the kernel, except that the value is always printed
+ * when requested.
+ */
+void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
+ unsigned int punc)
+{
+ kdb_symtab_t symtab, *symtab_p2;
+ if (symtab_p) {
+ symtab_p2 = (kdb_symtab_t *)symtab_p;
+ } else {
+ symtab_p2 = &symtab;
+ kdbnearsym(addr, symtab_p2);
+ }
+ if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
+ return;
+ if (punc & KDB_SP_SPACEB)
+ kdb_printf(" ");
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(kdb_machreg_fmt0, addr);
+ if (symtab_p2->sym_name) {
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(" ");
+ if (punc & KDB_SP_PAREN)
+ kdb_printf("(");
+ if (strcmp(symtab_p2->mod_name, "kernel"))
+ kdb_printf("[%s]", symtab_p2->mod_name);
+ kdb_printf("%s", symtab_p2->sym_name);
+ if (addr != symtab_p2->sym_start)
+ kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
+ if (punc & KDB_SP_SYMSIZE)
+ kdb_printf("/0x%lx",
+ symtab_p2->sym_end - symtab_p2->sym_start);
+ if (punc & KDB_SP_PAREN)
+ kdb_printf(")");
+ }
+ if (punc & KDB_SP_SPACEA)
+ kdb_printf(" ");
+ if (punc & KDB_SP_NEWLINE)
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_strdup - kdb equivalent of strdup, for disasm code.
+ * Inputs:
+ * str The string to duplicate.
+ * type Flags to kmalloc for the new string.
+ * Returns:
+ * Address of the new string, NULL if storage could not be allocated.
+ * Remarks:
+ * This is not in lib/string.c because it uses kmalloc which is not
+ * available when string.o is used in boot loaders.
+ */
+char *kdb_strdup(const char *str, gfp_t type)
+{
+ int n = strlen(str)+1;
+ char *s = kmalloc(n, type);
+ if (!s)
+ return NULL;
+ return strcpy(s, str);
+}
+
+/*
+ * kdb_getarea_size - Read an area of data. The kdb equivalent of
+ * copy_from_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * res Pointer to the area to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getarea_size(void *res, unsigned long addr, size_t size)
+{
+ int ret = probe_kernel_read((char *)res, (char *)addr, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_putarea_size - Write an area of data. The kdb equivalent of
+ * copy_to_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * addr Address of the area to write to.
+ * res Pointer to the area holding the data.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putarea_size(unsigned long addr, void *res, size_t size)
+{
+ int ret = probe_kernel_read((char *)addr, (char *)res, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_getphys - Read data from a physical address. Validate the
+ * address is in range, use kmap_atomic() to get data
+ * similar to kdb_getarea() - but for phys addresses
+ * Inputs:
+ * res Pointer to the word to receive the result
+ * addr Physical address of the area to copy
+ * size Size of the area
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+static int kdb_getphys(void *res, unsigned long addr, size_t size)
+{
+ unsigned long pfn;
+ void *vaddr;
+ struct page *page;
+
+ pfn = (addr >> PAGE_SHIFT);
+ if (!pfn_valid(pfn))
+ return 1;
+ page = pfn_to_page(pfn);
+ vaddr = kmap_atomic(page, KM_KDB);
+ memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
+ kunmap_atomic(vaddr, KM_KDB);
+
+ return 0;
+}
+
+/*
+ * kdb_getphysword
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+
+ switch (size) {
+ case 1:
+ diag = kdb_getphys(&w1, addr, sizeof(w1));
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getphys(&w2, addr, sizeof(w2));
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getphys(&w4, addr, sizeof(w4));
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getphys(&w8, addr, sizeof(w8));
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
+ * data as numbers.
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+ switch (size) {
+ case 1:
+ diag = kdb_getarea(w1, addr);
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getarea(w2, addr);
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getarea(w4, addr);
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getarea(w8, addr);
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_putword - Write a binary value. Unlike kdb_putarea, this
+ * treats data as numbers.
+ * Inputs:
+ * addr Address of the area to write to..
+ * word The value to set.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putword(unsigned long addr, unsigned long word, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ switch (size) {
+ case 1:
+ w1 = word;
+ diag = kdb_putarea(addr, w1);
+ break;
+ case 2:
+ w2 = word;
+ diag = kdb_putarea(addr, w2);
+ break;
+ case 4:
+ w4 = word;
+ diag = kdb_putarea(addr, w4);
+ break;
+ case 8:
+ if (size <= sizeof(word)) {
+ w8 = word;
+ diag = kdb_putarea(addr, w8);
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_task_state_string - Convert a string containing any of the
+ * letters DRSTCZEUIMA to a mask for the process state field and
+ * return the value. If no argument is supplied, return the mask
+ * that corresponds to environment variable PS, DRSTCZEU by
+ * default.
+ * Inputs:
+ * s String to convert
+ * Returns:
+ * Mask for process state.
+ * Notes:
+ * The mask folds data from several sources into a single long value, so
+ * be carefull not to overlap the bits. TASK_* bits are in the LSB,
+ * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
+ * is no overlap between TASK_* and EXIT_* but that may not always be
+ * true, so EXIT_* bits are shifted left 16 bits before being stored in
+ * the mask.
+ */
+
+/* unrunnable is < 0 */
+#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
+#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
+#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
+#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
+
+unsigned long kdb_task_state_string(const char *s)
+{
+ long res = 0;
+ if (!s) {
+ s = kdbgetenv("PS");
+ if (!s)
+ s = "DRSTCZEU"; /* default value for ps */
+ }
+ while (*s) {
+ switch (*s) {
+ case 'D':
+ res |= TASK_UNINTERRUPTIBLE;
+ break;
+ case 'R':
+ res |= RUNNING;
+ break;
+ case 'S':
+ res |= TASK_INTERRUPTIBLE;
+ break;
+ case 'T':
+ res |= TASK_STOPPED;
+ break;
+ case 'C':
+ res |= TASK_TRACED;
+ break;
+ case 'Z':
+ res |= EXIT_ZOMBIE << 16;
+ break;
+ case 'E':
+ res |= EXIT_DEAD << 16;
+ break;
+ case 'U':
+ res |= UNRUNNABLE;
+ break;
+ case 'I':
+ res |= IDLE;
+ break;
+ case 'M':
+ res |= DAEMON;
+ break;
+ case 'A':
+ res = ~0UL;
+ break;
+ default:
+ kdb_printf("%s: unknown flag '%c' ignored\n",
+ __func__, *s);
+ break;
+ }
+ ++s;
+ }
+ return res;
+}
+
+/*
+ * kdb_task_state_char - Return the character that represents the task state.
+ * Inputs:
+ * p struct task for the process
+ * Returns:
+ * One character to represent the task state.
+ */
+char kdb_task_state_char (const struct task_struct *p)
+{
+ int cpu;
+ char state;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return 'E';
+
+ cpu = kdb_process_cpu(p);
+ state = (p->state == 0) ? 'R' :
+ (p->state < 0) ? 'U' :
+ (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
+ (p->state & TASK_STOPPED) ? 'T' :
+ (p->state & TASK_TRACED) ? 'C' :
+ (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
+ (p->exit_state & EXIT_DEAD) ? 'E' :
+ (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ if (p->pid == 0) {
+ /* Idle task. Is it really idle, apart from the kdb
+ * interrupt? */
+ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
+ if (cpu != kdb_initial_cpu)
+ state = 'I'; /* idle task */
+ }
+ } else if (!p->mm && state == 'S') {
+ state = 'M'; /* sleeping system daemon */
+ }
+ return state;
+}
+
+/*
+ * kdb_task_state - Return true if a process has the desired state
+ * given by the mask.
+ * Inputs:
+ * p struct task for the process
+ * mask mask from kdb_task_state_string to select processes
+ * Returns:
+ * True if the process matches at least one criteria defined by the mask.
+ */
+unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+{
+ char state[] = { kdb_task_state_char(p), '\0' };
+ return (mask & kdb_task_state_string(state)) != 0;
+}
+
+/*
+ * kdb_print_nameval - Print a name and its value, converting the
+ * value to a symbol lookup if possible.
+ * Inputs:
+ * name field name to print
+ * val value of field
+ */
+void kdb_print_nameval(const char *name, unsigned long val)
+{
+ kdb_symtab_t symtab;
+ kdb_printf(" %-11.11s ", name);
+ if (kdbnearsym(val, &symtab))
+ kdb_symbol_print(val, &symtab,
+ KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
+ else
+ kdb_printf("0x%lx\n", val);
+}
+
+/* Last ditch allocator for debugging, so we can still debug even when
+ * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
+ * for space usage, not for speed. One smallish memory pool, the free
+ * chain is always in ascending address order to allow coalescing,
+ * allocations are done in brute force best fit.
+ */
+
+struct debug_alloc_header {
+ u32 next; /* offset of next header from start of pool */
+ u32 size;
+ void *caller;
+};
+
+/* The memory returned by this allocator must be aligned, which means
+ * so must the header size. Do not assume that sizeof(struct
+ * debug_alloc_header) is a multiple of the alignment, explicitly
+ * calculate the overhead of this header, including the alignment.
+ * The rest of this code must not use sizeof() on any header or
+ * pointer to a header.
+ */
+#define dah_align 8
+#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
+
+static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
+static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
+static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
+
+/* Locking is awkward. The debug code is called from all contexts,
+ * including non maskable interrupts. A normal spinlock is not safe
+ * in NMI context. Try to get the debug allocator lock, if it cannot
+ * be obtained after a second then give up. If the lock could not be
+ * previously obtained on this cpu then only try once.
+ *
+ * sparse has no annotation for "this function _sometimes_ acquires a
+ * lock", so fudge the acquire/release notation.
+ */
+static DEFINE_SPINLOCK(dap_lock);
+static int get_dap_lock(void)
+ __acquires(dap_lock)
+{
+ static int dap_locked = -1;
+ int count;
+ if (dap_locked == smp_processor_id())
+ count = 1;
+ else
+ count = 1000;
+ while (1) {
+ if (spin_trylock(&dap_lock)) {
+ dap_locked = -1;
+ return 1;
+ }
+ if (!count--)
+ break;
+ udelay(1000);
+ }
+ dap_locked = smp_processor_id();
+ __acquire(dap_lock);
+ return 0;
+}
+
+void *debug_kmalloc(size_t size, gfp_t flags)
+{
+ unsigned int rem, h_offset;
+ struct debug_alloc_header *best, *bestprev, *prev, *h;
+ void *p = NULL;
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return NULL;
+ }
+ h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first_call) {
+ h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
+ dah_first_call = 0;
+ }
+ size = ALIGN(size, dah_align);
+ prev = best = bestprev = NULL;
+ while (1) {
+ if (h->size >= size && (!best || h->size < best->size)) {
+ best = h;
+ bestprev = prev;
+ if (h->size == size)
+ break;
+ }
+ if (!h->next)
+ break;
+ prev = h;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
+ }
+ if (!best)
+ goto out;
+ rem = best->size - size;
+ /* The pool must always contain at least one header */
+ if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
+ goto out;
+ if (rem >= dah_overhead) {
+ best->size = size;
+ h_offset = ((char *)best - debug_alloc_pool) +
+ dah_overhead + best->size;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
+ h->size = rem - dah_overhead;
+ h->next = best->next;
+ } else
+ h_offset = best->next;
+ best->caller = __builtin_return_address(0);
+ dah_used += best->size;
+ dah_used_max = max(dah_used, dah_used_max);
+ if (bestprev)
+ bestprev->next = h_offset;
+ else
+ dah_first = h_offset;
+ p = (char *)best + dah_overhead;
+ memset(p, POISON_INUSE, best->size - 1);
+ *((char *)p + best->size - 1) = POISON_END;
+out:
+ spin_unlock(&dap_lock);
+ return p;
+}
+
+void debug_kfree(void *p)
+{
+ struct debug_alloc_header *h;
+ unsigned int h_offset;
+ if (!p)
+ return;
+ if ((char *)p < debug_alloc_pool ||
+ (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
+ kfree(p);
+ return;
+ }
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return; /* memory leak, cannot be helped */
+ }
+ h = (struct debug_alloc_header *)((char *)p - dah_overhead);
+ memset(p, POISON_FREE, h->size - 1);
+ *((char *)p + h->size - 1) = POISON_END;
+ h->caller = NULL;
+ dah_used -= h->size;
+ h_offset = (char *)h - debug_alloc_pool;
+ if (h_offset < dah_first) {
+ h->next = dah_first;
+ dah_first = h_offset;
+ } else {
+ struct debug_alloc_header *prev;
+ unsigned int prev_offset;
+ prev = (struct debug_alloc_header *)(debug_alloc_pool +
+ dah_first);
+ while (1) {
+ if (!prev->next || prev->next > h_offset)
+ break;
+ prev = (struct debug_alloc_header *)
+ (debug_alloc_pool + prev->next);
+ }
+ prev_offset = (char *)prev - debug_alloc_pool;
+ if (prev_offset + dah_overhead + prev->size == h_offset) {
+ prev->size += dah_overhead + h->size;
+ memset(h, POISON_FREE, dah_overhead - 1);
+ *((char *)h + dah_overhead - 1) = POISON_END;
+ h = prev;
+ h_offset = prev_offset;
+ } else {
+ h->next = prev->next;
+ prev->next = h_offset;
+ }
+ }
+ if (h_offset + dah_overhead + h->size == h->next) {
+ struct debug_alloc_header *next;
+ next = (struct debug_alloc_header *)
+ (debug_alloc_pool + h->next);
+ h->size += dah_overhead + next->size;
+ h->next = next->next;
+ memset(next, POISON_FREE, dah_overhead - 1);
+ *((char *)next + dah_overhead - 1) = POISON_END;
+ }
+ spin_unlock(&dap_lock);
+}
+
+void debug_kusage(void)
+{
+ struct debug_alloc_header *h_free, *h_used;
+#ifdef CONFIG_IA64
+ /* FIXME: using dah for ia64 unwind always results in a memory leak.
+ * Fix that memory leak first, then set debug_kusage_one_time = 1 for
+ * all architectures.
+ */
+ static int debug_kusage_one_time;
+#else
+ static int debug_kusage_one_time = 1;
+#endif
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return;
+ }
+ h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first == 0 &&
+ (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
+ dah_first_call))
+ goto out;
+ if (!debug_kusage_one_time)
+ goto out;
+ debug_kusage_one_time = 0;
+ kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
+ __func__, dah_first);
+ if (dah_first) {
+ h_used = (struct debug_alloc_header *)debug_alloc_pool;
+ kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
+ h_used->size);
+ }
+ do {
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+ h_free = (struct debug_alloc_header *)
+ (debug_alloc_pool + h_free->next);
+ } while (h_free->next);
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ if ((char *)h_used - debug_alloc_pool !=
+ sizeof(debug_alloc_pool_aligned))
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+out:
+ spin_unlock(&dap_lock);
+}
+
+/* Maintain a small stack of kdb_flags to allow recursion without disturbing
+ * the global kdb state.
+ */
+
+static int kdb_flags_stack[4], kdb_flags_index;
+
+void kdb_save_flags(void)
+{
+ BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
+ kdb_flags_stack[kdb_flags_index++] = kdb_flags;
+}
+
+void kdb_restore_flags(void)
+{
+ BUG_ON(kdb_flags_index <= 0);
+ kdb_flags = kdb_flags_stack[--kdb_flags_index];
+}
diff --git a/kernel/early_res.c b/kernel/early_res.c
index 31aa9332ef3f..7bfae887f211 100644
--- a/kernel/early_res.c
+++ b/kernel/early_res.c
@@ -7,6 +7,8 @@
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <linux/early_res.h>
+#include <linux/slab.h>
+#include <linux/kmemleak.h>
/*
* Early reserved memory areas.
@@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end)
struct early_res *r;
int i;
+ kmemleak_free_part(__va(start), end - start);
+
i = find_overlapped_early(start, end);
r = &early_res[i];
if (i >= max_early_res || r->end != end || r->start != start)
@@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end)
struct early_res *r;
int i;
+ kmemleak_free_part(__va(start), end - start);
+
if (start == end)
return;
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
static DEFINE_RWLOCK(exec_domains_lock);
-static u_long ident_map[32] = {
+static unsigned long ident_map[32] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
}
static struct exec_domain *
-lookup_exec_domain(u_long personality)
+lookup_exec_domain(unsigned int personality)
{
- struct exec_domain * ep;
- u_long pers = personality(personality);
+ unsigned int pers = personality(personality);
+ struct exec_domain *ep;
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
#ifdef CONFIG_MODULES
read_unlock(&exec_domains_lock);
- request_module("personality-%ld", pers);
+ request_module("personality-%d", pers);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
@@ -134,23 +134,14 @@ unregister:
return 0;
}
-int
-__set_personality(u_long personality)
+int __set_personality(unsigned int personality)
{
- struct exec_domain *ep, *oep;
-
- ep = lookup_exec_domain(personality);
- if (ep == current_thread_info()->exec_domain) {
- current->personality = personality;
- module_put(ep->module);
- return 0;
- }
+ struct exec_domain *oep = current_thread_info()->exec_domain;
+ current_thread_info()->exec_domain = lookup_exec_domain(personality);
current->personality = personality;
- oep = current_thread_info()->exec_domain;
- current_thread_info()->exec_domain = ep;
-
module_put(oep->module);
+
return 0;
}
@@ -188,17 +179,14 @@ static int __init proc_execdomains_init(void)
module_init(proc_execdomains_init);
#endif
-SYSCALL_DEFINE1(personality, u_long, personality)
+SYSCALL_DEFINE1(personality, unsigned int, personality)
{
- u_long old = current->personality;
+ unsigned int old = current->personality;
- if (personality != 0xffffffff) {
+ if (personality != 0xffffffff)
set_personality(personality);
- if (current->personality != personality)
- return -EINVAL;
- }
- return (long)old;
+ return old;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index eabca5a73a85..ceffc67b564a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -58,11 +58,11 @@
static void exit_mm(struct task_struct * tsk);
-static void __unhash_process(struct task_struct *p)
+static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
detach_pid(p, PIDTYPE_PID);
- if (thread_group_leader(p)) {
+ if (group_dead) {
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
@@ -79,10 +79,9 @@ static void __unhash_process(struct task_struct *p)
static void __exit_signal(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
+ bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
-
- BUG_ON(!sig);
- BUG_ON(!atomic_read(&sig->count));
+ struct tty_struct *uninitialized_var(tty);
sighand = rcu_dereference_check(tsk->sighand,
rcu_read_lock_held() ||
@@ -90,14 +89,16 @@ static void __exit_signal(struct task_struct *tsk)
spin_lock(&sighand->siglock);
posix_cpu_timers_exit(tsk);
- if (atomic_dec_and_test(&sig->count))
+ if (group_dead) {
posix_cpu_timers_exit_group(tsk);
- else {
+ tty = sig->tty;
+ sig->tty = NULL;
+ } else {
/*
* If there is any task waiting for the group exit
* then notify it:
*/
- if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
+ if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exit_task);
if (tsk == sig->curr_target)
@@ -123,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk)
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
- sig = NULL; /* Marker for below. */
}
- __unhash_process(tsk);
+ sig->nr_threads--;
+ __unhash_process(tsk, group_dead);
/*
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
flush_sigqueue(&tsk->pending);
-
- tsk->signal = NULL;
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
- if (sig) {
+ if (group_dead) {
flush_sigqueue(&sig->shared_pending);
- taskstats_tgid_free(sig);
- /*
- * Make sure ->signal can't go away under rq->lock,
- * see account_group_exec_runtime().
- */
- task_rq_unlock_wait(tsk);
- __cleanup_signal(sig);
+ tty_kref_put(tty);
}
}
@@ -856,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
- /* mt-exec, de_thread() is waiting for us */
- if (thread_group_leader(tsk) &&
- tsk->signal->group_exit_task &&
- tsk->signal->notify_count < 0)
+ /* mt-exec, de_thread() is waiting for group leader */
+ if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
-
write_unlock_irq(&tasklist_lock);
tracehook_report_death(tsk, signal, cookie, group_dead);
@@ -1002,8 +992,10 @@ NORET_TYPE void do_exit(long code)
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
+ task_lock(tsk);
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
+ task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d57d9e3a6e9..98b450876f93 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+ taskstats_tgid_free(sig);
+ kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+ if (atomic_dec_and_test(&sig->sigcnt))
+ free_signal_struct(sig);
+}
+
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
@@ -173,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk)
exit_creds(tsk);
delayacct_tsk_free(tsk);
+ put_signal_struct(tsk->signal);
if (!profile_handoff_task(tsk))
free_task(tsk);
@@ -864,8 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
if (!sig)
return -ENOMEM;
- atomic_set(&sig->count, 1);
+ sig->nr_threads = 1;
atomic_set(&sig->live, 1);
+ atomic_set(&sig->sigcnt, 1);
init_waitqueue_head(&sig->wait_chldexit);
if (clone_flags & CLONE_NEWPID)
sig->flags |= SIGNAL_UNKILLABLE;
@@ -885,22 +899,16 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sig->oom_adj = current->signal->oom_adj;
+ sig->oom_score_adj = current->signal->oom_score_adj;
return 0;
}
-void __cleanup_signal(struct signal_struct *sig)
-{
- thread_group_cputime_free(sig);
- tty_kref_put(sig->tty);
- kmem_cache_free(signal_cachep, sig);
-}
-
static void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
- new_flags &= ~PF_SUPERPRIV;
+ new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
new_flags |= PF_FORKNOEXEC;
new_flags |= PF_STARTING;
p->flags = new_flags;
@@ -1245,8 +1253,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
if (clone_flags & CLONE_THREAD) {
- atomic_inc(&current->signal->count);
+ current->signal->nr_threads++;
atomic_inc(&current->signal->live);
+ atomic_inc(&current->signal->sigcnt);
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
}
@@ -1259,7 +1268,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
- tty_kref_put(p->signal->tty);
p->signal->tty = tty_kref_get(current->signal->tty);
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1292,7 +1300,7 @@ bad_fork_cleanup_mm:
mmput(p->mm);
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
- __cleanup_signal(p->signal);
+ free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
@@ -1327,6 +1335,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
return regs;
}
+static inline void init_idle_pids(struct pid_link *links)
+{
+ enum pid_type type;
+
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+ INIT_HLIST_NODE(&links[type].node); /* not really needed */
+ links[type].pid = &init_struct_pid;
+ }
+}
+
struct task_struct * __cpuinit fork_idle(int cpu)
{
struct task_struct *task;
@@ -1334,8 +1352,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
&init_struct_pid, 0);
- if (!IS_ERR(task))
+ if (!IS_ERR(task)) {
+ init_idle_pids(task->pids);
init_idle(task, cpu);
+ }
return task;
}
@@ -1507,14 +1527,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
*flags_ptr |= CLONE_SIGHAND;
/*
- * If unsharing signal handlers and the task was created
- * using CLONE_THREAD, then must unshare the thread
- */
- if ((*flags_ptr & CLONE_SIGHAND) &&
- (atomic_read(&current->signal->count) > 1))
- *flags_ptr |= CLONE_THREAD;
-
- /*
* If unsharing namespace, must also unshare filesystem information.
*/
if (*flags_ptr & CLONE_NEWNS)
diff --git a/kernel/futex.c b/kernel/futex.c
index e7a35f1039e7..6a3a5fa1526d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
static struct task_struct * futex_find_get_task(pid_t pid)
{
struct task_struct *p;
- const struct cred *cred = current_cred(), *pcred;
rcu_read_lock();
p = find_task_by_vpid(pid);
- if (!p) {
- p = ERR_PTR(-ESRCH);
- } else {
- pcred = __task_cred(p);
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid)
- p = ERR_PTR(-ESRCH);
- else
- get_task_struct(p);
- }
+ if (p)
+ get_task_struct(p);
rcu_read_unlock();
@@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!pid)
return -ESRCH;
p = futex_find_get_task(pid);
- if (IS_ERR(p))
- return PTR_ERR(p);
+ if (!p)
+ return -ESRCH;
/*
* We need to look at the task state flags to figure out,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b9b134b35088..ce669174f355 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -89,8 +89,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
do {
seq = read_seqbegin(&xtime_lock);
- xts = current_kernel_time();
- tom = wall_to_monotonic;
+ xts = __current_kernel_time();
+ tom = __get_wall_to_monotonic();
} while (read_seqretry(&xtime_lock, seq));
xtim = timespec_to_ktime(xts);
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
static int hrtimer_get_target(int this_cpu, int pinned)
{
#ifdef CONFIG_NO_HZ
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- return preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+ return get_nohz_timer_target();
#endif
return this_cpu;
}
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base;
- struct timespec realtime_offset;
+ struct timespec realtime_offset, wtm;
unsigned long seq;
if (!hrtimer_hres_active())
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg)
do {
seq = read_seqbegin(&xtime_lock);
- set_normalized_timespec(&realtime_offset,
- -wall_to_monotonic.tv_sec,
- -wall_to_monotonic.tv_nsec);
+ wtm = __get_wall_to_monotonic();
} while (read_seqretry(&xtime_lock, seq));
+ set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
base = &__get_cpu_var(hrtimer_bases);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..d71a987fd2bf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/smp.h>
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
static int nr_slots[TYPE_MAX];
+/* Keep track of the breakpoints attached to tasks */
+static LIST_HEAD(bp_task_head);
+
static int constraints_initialized;
/* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
return 0;
}
-static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
+/*
+ * Count the number of breakpoints of the same type and same task.
+ * The given event must be not on the list.
+ */
+static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
{
- struct perf_event_context *ctx = tsk->perf_event_ctxp;
- struct list_head *list;
- struct perf_event *bp;
- unsigned long flags;
+ struct perf_event_context *ctx = bp->ctx;
+ struct perf_event *iter;
int count = 0;
- if (WARN_ONCE(!ctx, "No perf context for this task"))
- return 0;
-
- list = &ctx->event_list;
-
- raw_spin_lock_irqsave(&ctx->lock, flags);
-
- /*
- * The current breakpoint counter is not included in the list
- * at the open() callback time
- */
- list_for_each_entry(bp, list, event_entry) {
- if (bp->attr.type == PERF_TYPE_BREAKPOINT)
- if (find_slot_idx(bp) == type)
- count += hw_breakpoint_weight(bp);
+ list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
+ if (iter->ctx == ctx && find_slot_idx(iter) == type)
+ count += hw_breakpoint_weight(iter);
}
- raw_spin_unlock_irqrestore(&ctx->lock, flags);
-
return count;
}
@@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
if (!tsk)
slots->pinned += max_task_bp_pinned(cpu, type);
else
- slots->pinned += task_bp_pinned(tsk, type);
+ slots->pinned += task_bp_pinned(bp, type);
slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
if (!tsk)
nr += max_task_bp_pinned(cpu, type);
else
- nr += task_bp_pinned(tsk, type);
+ nr += task_bp_pinned(bp, type);
if (nr > slots->pinned)
slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
/*
* Add a pinned breakpoint for the given task in our constraint table
*/
-static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
enum bp_type_idx type, int weight)
{
unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
int old_idx = 0;
int idx = 0;
- old_count = task_bp_pinned(tsk, type);
+ old_count = task_bp_pinned(bp, type);
old_idx = old_count - 1;
idx = old_idx + weight;
+ /* tsk_pinned[n] is the number of tasks having n breakpoints */
tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
if (enable) {
tsk_pinned[idx]++;
@@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
int cpu = bp->cpu;
struct task_struct *tsk = bp->ctx->task;
+ /* Pinned counter cpu profiling */
+ if (!tsk) {
+
+ if (enable)
+ per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
+ else
+ per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+ return;
+ }
+
/* Pinned counter task profiling */
- if (tsk) {
- if (cpu >= 0) {
- toggle_bp_task_slot(tsk, cpu, enable, type, weight);
- return;
- }
+ if (!enable)
+ list_del(&bp->hw.bp_list);
+
+ if (cpu >= 0) {
+ toggle_bp_task_slot(bp, cpu, enable, type, weight);
+ } else {
for_each_online_cpu(cpu)
- toggle_bp_task_slot(tsk, cpu, enable, type, weight);
- return;
+ toggle_bp_task_slot(bp, cpu, enable, type, weight);
}
- /* Pinned counter cpu profiling */
if (enable)
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
- else
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+ list_add_tail(&bp->hw.bp_list, &bp_task_head);
+}
+
+/*
+ * Function to perform processor-specific cleanup during unregistration
+ */
+__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
+{
+ /*
+ * A weak stub function here for those archs that don't define
+ * it inside arch/.../kernel/hw_breakpoint.c
+ */
}
/*
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
weight = hw_breakpoint_weight(bp);
fetch_bp_busy_slots(&slots, bp, type);
+ /*
+ * Simulate the addition of this breakpoint to the constraints
+ * and see the result.
+ */
fetch_this_slot(&slots, weight);
/* Flexible counters need to keep at least one slot */
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
{
mutex_lock(&nr_bp_mutex);
+ arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp);
mutex_unlock(&nr_bp_mutex);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3164ba7ce151..c3003e9d91a3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
{
if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_TIMER))
+ if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
return;
desc->status |= IRQ_SUSPENDED;
}
@@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
/* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
desc->status |= flags;
+
+ if (chip != desc->chip)
+ irq_chip_set_defaults(desc->chip);
}
return ret;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13aff293f4de..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -16,6 +16,7 @@
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
+#include <linux/kdb.h>
#include <linux/err.h>
#include <linux/proc_fs.h>
#include <linux/sched.h> /* for cond_resched */
@@ -516,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return ret;
}
+#ifdef CONFIG_KGDB_KDB
+const char *kdb_walk_kallsyms(loff_t *pos)
+{
+ static struct kallsym_iter kdb_walk_kallsyms_iter;
+ if (*pos == 0) {
+ memset(&kdb_walk_kallsyms_iter, 0,
+ sizeof(kdb_walk_kallsyms_iter));
+ reset_iter(&kdb_walk_kallsyms_iter, 0);
+ }
+ while (1) {
+ if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
+ return NULL;
+ ++*pos;
+ /* Some debugging symbols have no name. Ignore them. */
+ if (kdb_walk_kallsyms_iter.name[0])
+ return kdb_walk_kallsyms_iter.name;
+ }
+}
+#endif /* CONFIG_KGDB_KDB */
+
static const struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 474a84715eac..131b1703936f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs)
size_t crash_get_memory_size(void)
{
- size_t size;
+ size_t size = 0;
mutex_lock(&kexec_mutex);
- size = crashk_res.end - crashk_res.start + 1;
+ if (crashk_res.end != crashk_res.start)
+ size = crashk_res.end - crashk_res.start + 1;
mutex_unlock(&kexec_mutex);
return size;
}
@@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size)
free_reserved_phys_range(end, crashk_res.end);
- if (start == end)
+ if ((start == end) && (crashk_res.parent != NULL))
release_resource(&crashk_res);
crashk_res.end = end - 1;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
deleted file mode 100644
index 11f3515ca83f..000000000000
--- a/kernel/kgdb.c
+++ /dev/null
@@ -1,1764 +0,0 @@
-/*
- * KGDB stub.
- *
- * Maintainer: Jason Wessel <jason.wessel@windriver.com>
- *
- * Copyright (C) 2000-2001 VERITAS Software Corporation.
- * Copyright (C) 2002-2004 Timesys Corporation
- * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
- * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
- * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
- * Copyright (C) 2005-2008 Wind River Systems, Inc.
- * Copyright (C) 2007 MontaVista Software, Inc.
- * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * Contributors at various stages not listed above:
- * Jason Wessel ( jason.wessel@windriver.com )
- * George Anzinger <george@mvista.com>
- * Anurekh Saxena (anurekh.saxena@timesys.com)
- * Lake Stevens Instrument Division (Glenn Engel)
- * Jim Kingdon, Cygnus Support.
- *
- * Original KGDB stub: David Grothe <dave@gcom.com>,
- * Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-#include <linux/pid_namespace.h>
-#include <linux/clocksource.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/console.h>
-#include <linux/threads.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ptrace.h>
-#include <linux/reboot.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/sysrq.h>
-#include <linux/init.h>
-#include <linux/kgdb.h>
-#include <linux/pid.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-
-#include <asm/cacheflush.h>
-#include <asm/byteorder.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/unaligned.h>
-
-static int kgdb_break_asap;
-
-#define KGDB_MAX_THREAD_QUERY 17
-struct kgdb_state {
- int ex_vector;
- int signo;
- int err_code;
- int cpu;
- int pass_exception;
- unsigned long thr_query;
- unsigned long threadid;
- long kgdb_usethreadid;
- struct pt_regs *linux_regs;
-};
-
-/* Exception state values */
-#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
-#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
-#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
-#define DCPU_SSTEP 0x8 /* CPU is single stepping */
-
-static struct debuggerinfo_struct {
- void *debuggerinfo;
- struct task_struct *task;
- int exception_state;
-} kgdb_info[NR_CPUS];
-
-/**
- * kgdb_connected - Is a host GDB connected to us?
- */
-int kgdb_connected;
-EXPORT_SYMBOL_GPL(kgdb_connected);
-
-/* All the KGDB handlers are installed */
-static int kgdb_io_module_registered;
-
-/* Guard for recursive entry */
-static int exception_level;
-
-static struct kgdb_io *kgdb_io_ops;
-static DEFINE_SPINLOCK(kgdb_registration_lock);
-
-/* kgdb console driver is loaded */
-static int kgdb_con_registered;
-/* determine if kgdb console output should be used */
-static int kgdb_use_con;
-
-static int __init opt_kgdb_con(char *str)
-{
- kgdb_use_con = 1;
- return 0;
-}
-
-early_param("kgdbcon", opt_kgdb_con);
-
-module_param(kgdb_use_con, int, 0644);
-
-/*
- * Holds information about breakpoints in a kernel. These breakpoints are
- * added and removed by gdb.
- */
-static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
- [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
-};
-
-/*
- * The CPU# of the active CPU, or -1 if none:
- */
-atomic_t kgdb_active = ATOMIC_INIT(-1);
-
-/*
- * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
- * bootup code (which might not have percpu set up yet):
- */
-static atomic_t passive_cpu_wait[NR_CPUS];
-static atomic_t cpu_in_kgdb[NR_CPUS];
-atomic_t kgdb_setting_breakpoint;
-
-struct task_struct *kgdb_usethread;
-struct task_struct *kgdb_contthread;
-
-int kgdb_single_step;
-pid_t kgdb_sstep_pid;
-
-/* Our I/O buffers. */
-static char remcom_in_buffer[BUFMAX];
-static char remcom_out_buffer[BUFMAX];
-
-/* Storage for the registers, in GDB format. */
-static unsigned long gdb_regs[(NUMREGBYTES +
- sizeof(unsigned long) - 1) /
- sizeof(unsigned long)];
-
-/* to keep track of the CPU which is doing the single stepping*/
-atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
-
-/*
- * If you are debugging a problem where roundup (the collection of
- * all other CPUs) is a problem [this should be extremely rare],
- * then use the nokgdbroundup option to avoid roundup. In that case
- * the other CPUs might interfere with your debugging context, so
- * use this with care:
- */
-static int kgdb_do_roundup = 1;
-
-static int __init opt_nokgdbroundup(char *str)
-{
- kgdb_do_roundup = 0;
-
- return 0;
-}
-
-early_param("nokgdbroundup", opt_nokgdbroundup);
-
-/*
- * Finally, some KGDB code :-)
- */
-
-/*
- * Weak aliases for breakpoint management,
- * can be overriden by architectures when needed:
- */
-int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
-{
- int err;
-
- err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
- if (err)
- return err;
-
- return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
- BREAK_INSTR_SIZE);
-}
-
-int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
-{
- return probe_kernel_write((char *)addr,
- (char *)bundle, BREAK_INSTR_SIZE);
-}
-
-int __weak kgdb_validate_break_address(unsigned long addr)
-{
- char tmp_variable[BREAK_INSTR_SIZE];
- int err;
- /* Validate setting the breakpoint and then removing it. In the
- * remove fails, the kernel needs to emit a bad message because we
- * are deep trouble not being able to put things back the way we
- * found them.
- */
- err = kgdb_arch_set_breakpoint(addr, tmp_variable);
- if (err)
- return err;
- err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
- if (err)
- printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
- "memory destroyed at: %lx", addr);
- return err;
-}
-
-unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
-{
- return instruction_pointer(regs);
-}
-
-int __weak kgdb_arch_init(void)
-{
- return 0;
-}
-
-int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
-{
- return 0;
-}
-
-void __weak
-kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
-{
- return;
-}
-
-/**
- * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
- * @regs: Current &struct pt_regs.
- *
- * This function will be called if the particular architecture must
- * disable hardware debugging while it is processing gdb packets or
- * handling exception.
- */
-void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
-{
-}
-
-/*
- * GDB remote protocol parser:
- */
-
-static int hex(char ch)
-{
- if ((ch >= 'a') && (ch <= 'f'))
- return ch - 'a' + 10;
- if ((ch >= '0') && (ch <= '9'))
- return ch - '0';
- if ((ch >= 'A') && (ch <= 'F'))
- return ch - 'A' + 10;
- return -1;
-}
-
-/* scan for the sequence $<data>#<checksum> */
-static void get_packet(char *buffer)
-{
- unsigned char checksum;
- unsigned char xmitcsum;
- int count;
- char ch;
-
- do {
- /*
- * Spin and wait around for the start character, ignore all
- * other characters:
- */
- while ((ch = (kgdb_io_ops->read_char())) != '$')
- /* nothing */;
-
- kgdb_connected = 1;
- checksum = 0;
- xmitcsum = -1;
-
- count = 0;
-
- /*
- * now, read until a # or end of buffer is found:
- */
- while (count < (BUFMAX - 1)) {
- ch = kgdb_io_ops->read_char();
- if (ch == '#')
- break;
- checksum = checksum + ch;
- buffer[count] = ch;
- count = count + 1;
- }
- buffer[count] = 0;
-
- if (ch == '#') {
- xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
- xmitcsum += hex(kgdb_io_ops->read_char());
-
- if (checksum != xmitcsum)
- /* failed checksum */
- kgdb_io_ops->write_char('-');
- else
- /* successful transfer */
- kgdb_io_ops->write_char('+');
- if (kgdb_io_ops->flush)
- kgdb_io_ops->flush();
- }
- } while (checksum != xmitcsum);
-}
-
-/*
- * Send the packet in buffer.
- * Check for gdb connection if asked for.
- */
-static void put_packet(char *buffer)
-{
- unsigned char checksum;
- int count;
- char ch;
-
- /*
- * $<packet info>#<checksum>.
- */
- while (1) {
- kgdb_io_ops->write_char('$');
- checksum = 0;
- count = 0;
-
- while ((ch = buffer[count])) {
- kgdb_io_ops->write_char(ch);
- checksum += ch;
- count++;
- }
-
- kgdb_io_ops->write_char('#');
- kgdb_io_ops->write_char(hex_asc_hi(checksum));
- kgdb_io_ops->write_char(hex_asc_lo(checksum));
- if (kgdb_io_ops->flush)
- kgdb_io_ops->flush();
-
- /* Now see what we get in reply. */
- ch = kgdb_io_ops->read_char();
-
- if (ch == 3)
- ch = kgdb_io_ops->read_char();
-
- /* If we get an ACK, we are done. */
- if (ch == '+')
- return;
-
- /*
- * If we get the start of another packet, this means
- * that GDB is attempting to reconnect. We will NAK
- * the packet being sent, and stop trying to send this
- * packet.
- */
- if (ch == '$') {
- kgdb_io_ops->write_char('-');
- if (kgdb_io_ops->flush)
- kgdb_io_ops->flush();
- return;
- }
- }
-}
-
-/*
- * Convert the memory pointed to by mem into hex, placing result in buf.
- * Return a pointer to the last char put in buf (null). May return an error.
- */
-int kgdb_mem2hex(char *mem, char *buf, int count)
-{
- char *tmp;
- int err;
-
- /*
- * We use the upper half of buf as an intermediate buffer for the
- * raw memory copy. Hex conversion will work against this one.
- */
- tmp = buf + count;
-
- err = probe_kernel_read(tmp, mem, count);
- if (!err) {
- while (count > 0) {
- buf = pack_hex_byte(buf, *tmp);
- tmp++;
- count--;
- }
-
- *buf = 0;
- }
-
- return err;
-}
-
-/*
- * Copy the binary array pointed to by buf into mem. Fix $, #, and
- * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
- * The input buf is overwitten with the result to write to mem.
- */
-static int kgdb_ebin2mem(char *buf, char *mem, int count)
-{
- int size = 0;
- char *c = buf;
-
- while (count-- > 0) {
- c[size] = *buf++;
- if (c[size] == 0x7d)
- c[size] = *buf++ ^ 0x20;
- size++;
- }
-
- return probe_kernel_write(mem, c, size);
-}
-
-/*
- * Convert the hex array pointed to by buf into binary to be placed in mem.
- * Return a pointer to the character AFTER the last byte written.
- * May return an error.
- */
-int kgdb_hex2mem(char *buf, char *mem, int count)
-{
- char *tmp_raw;
- char *tmp_hex;
-
- /*
- * We use the upper half of buf as an intermediate buffer for the
- * raw memory that is converted from hex.
- */
- tmp_raw = buf + count * 2;
-
- tmp_hex = tmp_raw - 1;
- while (tmp_hex >= buf) {
- tmp_raw--;
- *tmp_raw = hex(*tmp_hex--);
- *tmp_raw |= hex(*tmp_hex--) << 4;
- }
-
- return probe_kernel_write(mem, tmp_raw, count);
-}
-
-/*
- * While we find nice hex chars, build a long_val.
- * Return number of chars processed.
- */
-int kgdb_hex2long(char **ptr, unsigned long *long_val)
-{
- int hex_val;
- int num = 0;
- int negate = 0;
-
- *long_val = 0;
-
- if (**ptr == '-') {
- negate = 1;
- (*ptr)++;
- }
- while (**ptr) {
- hex_val = hex(**ptr);
- if (hex_val < 0)
- break;
-
- *long_val = (*long_val << 4) | hex_val;
- num++;
- (*ptr)++;
- }
-
- if (negate)
- *long_val = -*long_val;
-
- return num;
-}
-
-/* Write memory due to an 'M' or 'X' packet. */
-static int write_mem_msg(int binary)
-{
- char *ptr = &remcom_in_buffer[1];
- unsigned long addr;
- unsigned long length;
- int err;
-
- if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
- kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
- if (binary)
- err = kgdb_ebin2mem(ptr, (char *)addr, length);
- else
- err = kgdb_hex2mem(ptr, (char *)addr, length);
- if (err)
- return err;
- if (CACHE_FLUSH_IS_SAFE)
- flush_icache_range(addr, addr + length);
- return 0;
- }
-
- return -EINVAL;
-}
-
-static void error_packet(char *pkt, int error)
-{
- error = -error;
- pkt[0] = 'E';
- pkt[1] = hex_asc[(error / 10)];
- pkt[2] = hex_asc[(error % 10)];
- pkt[3] = '\0';
-}
-
-/*
- * Thread ID accessors. We represent a flat TID space to GDB, where
- * the per CPU idle threads (which under Linux all have PID 0) are
- * remapped to negative TIDs.
- */
-
-#define BUF_THREAD_ID_SIZE 16
-
-static char *pack_threadid(char *pkt, unsigned char *id)
-{
- char *limit;
-
- limit = pkt + BUF_THREAD_ID_SIZE;
- while (pkt < limit)
- pkt = pack_hex_byte(pkt, *id++);
-
- return pkt;
-}
-
-static void int_to_threadref(unsigned char *id, int value)
-{
- unsigned char *scan;
- int i = 4;
-
- scan = (unsigned char *)id;
- while (i--)
- *scan++ = 0;
- put_unaligned_be32(value, scan);
-}
-
-static struct task_struct *getthread(struct pt_regs *regs, int tid)
-{
- /*
- * Non-positive TIDs are remapped to the cpu shadow information
- */
- if (tid == 0 || tid == -1)
- tid = -atomic_read(&kgdb_active) - 2;
- if (tid < -1 && tid > -NR_CPUS - 2) {
- if (kgdb_info[-tid - 2].task)
- return kgdb_info[-tid - 2].task;
- else
- return idle_task(-tid - 2);
- }
- if (tid <= 0) {
- printk(KERN_ERR "KGDB: Internal thread select error\n");
- dump_stack();
- return NULL;
- }
-
- /*
- * find_task_by_pid_ns() does not take the tasklist lock anymore
- * but is nicely RCU locked - hence is a pretty resilient
- * thing to use:
- */
- return find_task_by_pid_ns(tid, &init_pid_ns);
-}
-
-/*
- * Some architectures need cache flushes when we set/clear a
- * breakpoint:
- */
-static void kgdb_flush_swbreak_addr(unsigned long addr)
-{
- if (!CACHE_FLUSH_IS_SAFE)
- return;
-
- if (current->mm && current->mm->mmap_cache) {
- flush_cache_range(current->mm->mmap_cache,
- addr, addr + BREAK_INSTR_SIZE);
- }
- /* Force flush instruction cache if it was outside the mm */
- flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
-}
-
-/*
- * SW breakpoint management:
- */
-static int kgdb_activate_sw_breakpoints(void)
-{
- unsigned long addr;
- int error;
- int ret = 0;
- int i;
-
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if (kgdb_break[i].state != BP_SET)
- continue;
-
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_set_breakpoint(addr,
- kgdb_break[i].saved_instr);
- if (error) {
- ret = error;
- printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
- continue;
- }
-
- kgdb_flush_swbreak_addr(addr);
- kgdb_break[i].state = BP_ACTIVE;
- }
- return ret;
-}
-
-static int kgdb_set_sw_break(unsigned long addr)
-{
- int err = kgdb_validate_break_address(addr);
- int breakno = -1;
- int i;
-
- if (err)
- return err;
-
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if ((kgdb_break[i].state == BP_SET) &&
- (kgdb_break[i].bpt_addr == addr))
- return -EEXIST;
- }
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if (kgdb_break[i].state == BP_REMOVED &&
- kgdb_break[i].bpt_addr == addr) {
- breakno = i;
- break;
- }
- }
-
- if (breakno == -1) {
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if (kgdb_break[i].state == BP_UNDEFINED) {
- breakno = i;
- break;
- }
- }
- }
-
- if (breakno == -1)
- return -E2BIG;
-
- kgdb_break[breakno].state = BP_SET;
- kgdb_break[breakno].type = BP_BREAKPOINT;
- kgdb_break[breakno].bpt_addr = addr;
-
- return 0;
-}
-
-static int kgdb_deactivate_sw_breakpoints(void)
-{
- unsigned long addr;
- int error;
- int ret = 0;
- int i;
-
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if (kgdb_break[i].state != BP_ACTIVE)
- continue;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
- if (error) {
- printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
- ret = error;
- }
-
- kgdb_flush_swbreak_addr(addr);
- kgdb_break[i].state = BP_SET;
- }
- return ret;
-}
-
-static int kgdb_remove_sw_break(unsigned long addr)
-{
- int i;
-
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if ((kgdb_break[i].state == BP_SET) &&
- (kgdb_break[i].bpt_addr == addr)) {
- kgdb_break[i].state = BP_REMOVED;
- return 0;
- }
- }
- return -ENOENT;
-}
-
-int kgdb_isremovedbreak(unsigned long addr)
-{
- int i;
-
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if ((kgdb_break[i].state == BP_REMOVED) &&
- (kgdb_break[i].bpt_addr == addr))
- return 1;
- }
- return 0;
-}
-
-static int remove_all_break(void)
-{
- unsigned long addr;
- int error;
- int i;
-
- /* Clear memory breakpoints. */
- for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
- if (kgdb_break[i].state != BP_ACTIVE)
- goto setundefined;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
- if (error)
- printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
- addr);
-setundefined:
- kgdb_break[i].state = BP_UNDEFINED;
- }
-
- /* Clear hardware breakpoints. */
- if (arch_kgdb_ops.remove_all_hw_break)
- arch_kgdb_ops.remove_all_hw_break();
-
- return 0;
-}
-
-/*
- * Remap normal tasks to their real PID,
- * CPU shadow threads are mapped to -CPU - 2
- */
-static inline int shadow_pid(int realpid)
-{
- if (realpid)
- return realpid;
-
- return -raw_smp_processor_id() - 2;
-}
-
-static char gdbmsgbuf[BUFMAX + 1];
-
-static void kgdb_msg_write(const char *s, int len)
-{
- char *bufptr;
- int wcount;
- int i;
-
- /* 'O'utput */
- gdbmsgbuf[0] = 'O';
-
- /* Fill and send buffers... */
- while (len > 0) {
- bufptr = gdbmsgbuf + 1;
-
- /* Calculate how many this time */
- if ((len << 1) > (BUFMAX - 2))
- wcount = (BUFMAX - 2) >> 1;
- else
- wcount = len;
-
- /* Pack in hex chars */
- for (i = 0; i < wcount; i++)
- bufptr = pack_hex_byte(bufptr, s[i]);
- *bufptr = '\0';
-
- /* Move up */
- s += wcount;
- len -= wcount;
-
- /* Write packet */
- put_packet(gdbmsgbuf);
- }
-}
-
-/*
- * Return true if there is a valid kgdb I/O module. Also if no
- * debugger is attached a message can be printed to the console about
- * waiting for the debugger to attach.
- *
- * The print_wait argument is only to be true when called from inside
- * the core kgdb_handle_exception, because it will wait for the
- * debugger to attach.
- */
-static int kgdb_io_ready(int print_wait)
-{
- if (!kgdb_io_ops)
- return 0;
- if (kgdb_connected)
- return 1;
- if (atomic_read(&kgdb_setting_breakpoint))
- return 1;
- if (print_wait)
- printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
- return 1;
-}
-
-/*
- * All the functions that start with gdb_cmd are the various
- * operations to implement the handlers for the gdbserial protocol
- * where KGDB is communicating with an external debugger
- */
-
-/* Handle the '?' status packets */
-static void gdb_cmd_status(struct kgdb_state *ks)
-{
- /*
- * We know that this packet is only sent
- * during initial connect. So to be safe,
- * we clear out our breakpoints now in case
- * GDB is reconnecting.
- */
- remove_all_break();
-
- remcom_out_buffer[0] = 'S';
- pack_hex_byte(&remcom_out_buffer[1], ks->signo);
-}
-
-/* Handle the 'g' get registers request */
-static void gdb_cmd_getregs(struct kgdb_state *ks)
-{
- struct task_struct *thread;
- void *local_debuggerinfo;
- int i;
-
- thread = kgdb_usethread;
- if (!thread) {
- thread = kgdb_info[ks->cpu].task;
- local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
- } else {
- local_debuggerinfo = NULL;
- for_each_online_cpu(i) {
- /*
- * Try to find the task on some other
- * or possibly this node if we do not
- * find the matching task then we try
- * to approximate the results.
- */
- if (thread == kgdb_info[i].task)
- local_debuggerinfo = kgdb_info[i].debuggerinfo;
- }
- }
-
- /*
- * All threads that don't have debuggerinfo should be
- * in schedule() sleeping, since all other CPUs
- * are in kgdb_wait, and thus have debuggerinfo.
- */
- if (local_debuggerinfo) {
- pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
- } else {
- /*
- * Pull stuff saved during switch_to; nothing
- * else is accessible (or even particularly
- * relevant).
- *
- * This should be enough for a stack trace.
- */
- sleeping_thread_to_gdb_regs(gdb_regs, thread);
- }
- kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
-}
-
-/* Handle the 'G' set registers request */
-static void gdb_cmd_setregs(struct kgdb_state *ks)
-{
- kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
-
- if (kgdb_usethread && kgdb_usethread != current) {
- error_packet(remcom_out_buffer, -EINVAL);
- } else {
- gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
- strcpy(remcom_out_buffer, "OK");
- }
-}
-
-/* Handle the 'm' memory read bytes */
-static void gdb_cmd_memread(struct kgdb_state *ks)
-{
- char *ptr = &remcom_in_buffer[1];
- unsigned long length;
- unsigned long addr;
- int err;
-
- if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
- kgdb_hex2long(&ptr, &length) > 0) {
- err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
- if (err)
- error_packet(remcom_out_buffer, err);
- } else {
- error_packet(remcom_out_buffer, -EINVAL);
- }
-}
-
-/* Handle the 'M' memory write bytes */
-static void gdb_cmd_memwrite(struct kgdb_state *ks)
-{
- int err = write_mem_msg(0);
-
- if (err)
- error_packet(remcom_out_buffer, err);
- else
- strcpy(remcom_out_buffer, "OK");
-}
-
-/* Handle the 'X' memory binary write bytes */
-static void gdb_cmd_binwrite(struct kgdb_state *ks)
-{
- int err = write_mem_msg(1);
-
- if (err)
- error_packet(remcom_out_buffer, err);
- else
- strcpy(remcom_out_buffer, "OK");
-}
-
-/* Handle the 'D' or 'k', detach or kill packets */
-static void gdb_cmd_detachkill(struct kgdb_state *ks)
-{
- int error;
-
- /* The detach case */
- if (remcom_in_buffer[0] == 'D') {
- error = remove_all_break();
- if (error < 0) {
- error_packet(remcom_out_buffer, error);
- } else {
- strcpy(remcom_out_buffer, "OK");
- kgdb_connected = 0;
- }
- put_packet(remcom_out_buffer);
- } else {
- /*
- * Assume the kill case, with no exit code checking,
- * trying to force detach the debugger:
- */
- remove_all_break();
- kgdb_connected = 0;
- }
-}
-
-/* Handle the 'R' reboot packets */
-static int gdb_cmd_reboot(struct kgdb_state *ks)
-{
- /* For now, only honor R0 */
- if (strcmp(remcom_in_buffer, "R0") == 0) {
- printk(KERN_CRIT "Executing emergency reboot\n");
- strcpy(remcom_out_buffer, "OK");
- put_packet(remcom_out_buffer);
-
- /*
- * Execution should not return from
- * machine_emergency_restart()
- */
- machine_emergency_restart();
- kgdb_connected = 0;
-
- return 1;
- }
- return 0;
-}
-
-/* Handle the 'q' query packets */
-static void gdb_cmd_query(struct kgdb_state *ks)
-{
- struct task_struct *g;
- struct task_struct *p;
- unsigned char thref[8];
- char *ptr;
- int i;
- int cpu;
- int finished = 0;
-
- switch (remcom_in_buffer[1]) {
- case 's':
- case 'f':
- if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
- error_packet(remcom_out_buffer, -EINVAL);
- break;
- }
-
- i = 0;
- remcom_out_buffer[0] = 'm';
- ptr = remcom_out_buffer + 1;
- if (remcom_in_buffer[1] == 'f') {
- /* Each cpu is a shadow thread */
- for_each_online_cpu(cpu) {
- ks->thr_query = 0;
- int_to_threadref(thref, -cpu - 2);
- pack_threadid(ptr, thref);
- ptr += BUF_THREAD_ID_SIZE;
- *(ptr++) = ',';
- i++;
- }
- }
-
- do_each_thread(g, p) {
- if (i >= ks->thr_query && !finished) {
- int_to_threadref(thref, p->pid);
- pack_threadid(ptr, thref);
- ptr += BUF_THREAD_ID_SIZE;
- *(ptr++) = ',';
- ks->thr_query++;
- if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
- finished = 1;
- }
- i++;
- } while_each_thread(g, p);
-
- *(--ptr) = '\0';
- break;
-
- case 'C':
- /* Current thread id */
- strcpy(remcom_out_buffer, "QC");
- ks->threadid = shadow_pid(current->pid);
- int_to_threadref(thref, ks->threadid);
- pack_threadid(remcom_out_buffer + 2, thref);
- break;
- case 'T':
- if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
- error_packet(remcom_out_buffer, -EINVAL);
- break;
- }
- ks->threadid = 0;
- ptr = remcom_in_buffer + 17;
- kgdb_hex2long(&ptr, &ks->threadid);
- if (!getthread(ks->linux_regs, ks->threadid)) {
- error_packet(remcom_out_buffer, -EINVAL);
- break;
- }
- if ((int)ks->threadid > 0) {
- kgdb_mem2hex(getthread(ks->linux_regs,
- ks->threadid)->comm,
- remcom_out_buffer, 16);
- } else {
- static char tmpstr[23 + BUF_THREAD_ID_SIZE];
-
- sprintf(tmpstr, "shadowCPU%d",
- (int)(-ks->threadid - 2));
- kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
- }
- break;
- }
-}
-
-/* Handle the 'H' task query packets */
-static void gdb_cmd_task(struct kgdb_state *ks)
-{
- struct task_struct *thread;
- char *ptr;
-
- switch (remcom_in_buffer[1]) {
- case 'g':
- ptr = &remcom_in_buffer[2];
- kgdb_hex2long(&ptr, &ks->threadid);
- thread = getthread(ks->linux_regs, ks->threadid);
- if (!thread && ks->threadid > 0) {
- error_packet(remcom_out_buffer, -EINVAL);
- break;
- }
- kgdb_usethread = thread;
- ks->kgdb_usethreadid = ks->threadid;
- strcpy(remcom_out_buffer, "OK");
- break;
- case 'c':
- ptr = &remcom_in_buffer[2];
- kgdb_hex2long(&ptr, &ks->threadid);
- if (!ks->threadid) {
- kgdb_contthread = NULL;
- } else {
- thread = getthread(ks->linux_regs, ks->threadid);
- if (!thread && ks->threadid > 0) {
- error_packet(remcom_out_buffer, -EINVAL);
- break;
- }
- kgdb_contthread = thread;
- }
- strcpy(remcom_out_buffer, "OK");
- break;
- }
-}
-
-/* Handle the 'T' thread query packets */
-static void gdb_cmd_thread(struct kgdb_state *ks)
-{
- char *ptr = &remcom_in_buffer[1];
- struct task_struct *thread;
-
- kgdb_hex2long(&ptr, &ks->threadid);
- thread = getthread(ks->linux_regs, ks->threadid);
- if (thread)
- strcpy(remcom_out_buffer, "OK");
- else
- error_packet(remcom_out_buffer, -EINVAL);
-}
-
-/* Handle the 'z' or 'Z' breakpoint remove or set packets */
-static void gdb_cmd_break(struct kgdb_state *ks)
-{
- /*
- * Since GDB-5.3, it's been drafted that '0' is a software
- * breakpoint, '1' is a hardware breakpoint, so let's do that.
- */
- char *bpt_type = &remcom_in_buffer[1];
- char *ptr = &remcom_in_buffer[2];
- unsigned long addr;
- unsigned long length;
- int error = 0;
-
- if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
- /* Unsupported */
- if (*bpt_type > '4')
- return;
- } else {
- if (*bpt_type != '0' && *bpt_type != '1')
- /* Unsupported. */
- return;
- }
-
- /*
- * Test if this is a hardware breakpoint, and
- * if we support it:
- */
- if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
- /* Unsupported. */
- return;
-
- if (*(ptr++) != ',') {
- error_packet(remcom_out_buffer, -EINVAL);
- return;
- }
- if (!kgdb_hex2long(&ptr, &addr)) {
- error_packet(remcom_out_buffer, -EINVAL);
- return;
- }
- if (*(ptr++) != ',' ||
- !kgdb_hex2long(&ptr, &length)) {
- error_packet(remcom_out_buffer, -EINVAL);
- return;
- }
-
- if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
- error = kgdb_set_sw_break(addr);
- else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
- error = kgdb_remove_sw_break(addr);
- else if (remcom_in_buffer[0] == 'Z')
- error = arch_kgdb_ops.set_hw_breakpoint(addr,
- (int)length, *bpt_type - '0');
- else if (remcom_in_buffer[0] == 'z')
- error = arch_kgdb_ops.remove_hw_breakpoint(addr,
- (int) length, *bpt_type - '0');
-
- if (error == 0)
- strcpy(remcom_out_buffer, "OK");
- else
- error_packet(remcom_out_buffer, error);
-}
-
-/* Handle the 'C' signal / exception passing packets */
-static int gdb_cmd_exception_pass(struct kgdb_state *ks)
-{
- /* C09 == pass exception
- * C15 == detach kgdb, pass exception
- */
- if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
-
- ks->pass_exception = 1;
- remcom_in_buffer[0] = 'c';
-
- } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
-
- ks->pass_exception = 1;
- remcom_in_buffer[0] = 'D';
- remove_all_break();
- kgdb_connected = 0;
- return 1;
-
- } else {
- kgdb_msg_write("KGDB only knows signal 9 (pass)"
- " and 15 (pass and disconnect)\n"
- "Executing a continue without signal passing\n", 0);
- remcom_in_buffer[0] = 'c';
- }
-
- /* Indicate fall through */
- return -1;
-}
-
-/*
- * This function performs all gdbserial command procesing
- */
-static int gdb_serial_stub(struct kgdb_state *ks)
-{
- int error = 0;
- int tmp;
-
- /* Clear the out buffer. */
- memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
-
- if (kgdb_connected) {
- unsigned char thref[8];
- char *ptr;
-
- /* Reply to host that an exception has occurred */
- ptr = remcom_out_buffer;
- *ptr++ = 'T';
- ptr = pack_hex_byte(ptr, ks->signo);
- ptr += strlen(strcpy(ptr, "thread:"));
- int_to_threadref(thref, shadow_pid(current->pid));
- ptr = pack_threadid(ptr, thref);
- *ptr++ = ';';
- put_packet(remcom_out_buffer);
- }
-
- kgdb_usethread = kgdb_info[ks->cpu].task;
- ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
- ks->pass_exception = 0;
-
- while (1) {
- error = 0;
-
- /* Clear the out buffer. */
- memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
-
- get_packet(remcom_in_buffer);
-
- switch (remcom_in_buffer[0]) {
- case '?': /* gdbserial status */
- gdb_cmd_status(ks);
- break;
- case 'g': /* return the value of the CPU registers */
- gdb_cmd_getregs(ks);
- break;
- case 'G': /* set the value of the CPU registers - return OK */
- gdb_cmd_setregs(ks);
- break;
- case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
- gdb_cmd_memread(ks);
- break;
- case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
- gdb_cmd_memwrite(ks);
- break;
- case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
- gdb_cmd_binwrite(ks);
- break;
- /* kill or detach. KGDB should treat this like a
- * continue.
- */
- case 'D': /* Debugger detach */
- case 'k': /* Debugger detach via kill */
- gdb_cmd_detachkill(ks);
- goto default_handle;
- case 'R': /* Reboot */
- if (gdb_cmd_reboot(ks))
- goto default_handle;
- break;
- case 'q': /* query command */
- gdb_cmd_query(ks);
- break;
- case 'H': /* task related */
- gdb_cmd_task(ks);
- break;
- case 'T': /* Query thread status */
- gdb_cmd_thread(ks);
- break;
- case 'z': /* Break point remove */
- case 'Z': /* Break point set */
- gdb_cmd_break(ks);
- break;
- case 'C': /* Exception passing */
- tmp = gdb_cmd_exception_pass(ks);
- if (tmp > 0)
- goto default_handle;
- if (tmp == 0)
- break;
- /* Fall through on tmp < 0 */
- case 'c': /* Continue packet */
- case 's': /* Single step packet */
- if (kgdb_contthread && kgdb_contthread != current) {
- /* Can't switch threads in kgdb */
- error_packet(remcom_out_buffer, -EINVAL);
- break;
- }
- kgdb_activate_sw_breakpoints();
- /* Fall through to default processing */
- default:
-default_handle:
- error = kgdb_arch_handle_exception(ks->ex_vector,
- ks->signo,
- ks->err_code,
- remcom_in_buffer,
- remcom_out_buffer,
- ks->linux_regs);
- /*
- * Leave cmd processing on error, detach,
- * kill, continue, or single step.
- */
- if (error >= 0 || remcom_in_buffer[0] == 'D' ||
- remcom_in_buffer[0] == 'k') {
- error = 0;
- goto kgdb_exit;
- }
-
- }
-
- /* reply to the request */
- put_packet(remcom_out_buffer);
- }
-
-kgdb_exit:
- if (ks->pass_exception)
- error = 1;
- return error;
-}
-
-static int kgdb_reenter_check(struct kgdb_state *ks)
-{
- unsigned long addr;
-
- if (atomic_read(&kgdb_active) != raw_smp_processor_id())
- return 0;
-
- /* Panic on recursive debugger calls: */
- exception_level++;
- addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
- kgdb_deactivate_sw_breakpoints();
-
- /*
- * If the break point removed ok at the place exception
- * occurred, try to recover and print a warning to the end
- * user because the user planted a breakpoint in a place that
- * KGDB needs in order to function.
- */
- if (kgdb_remove_sw_break(addr) == 0) {
- exception_level = 0;
- kgdb_skipexception(ks->ex_vector, ks->linux_regs);
- kgdb_activate_sw_breakpoints();
- printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
- addr);
- WARN_ON_ONCE(1);
-
- return 1;
- }
- remove_all_break();
- kgdb_skipexception(ks->ex_vector, ks->linux_regs);
-
- if (exception_level > 1) {
- dump_stack();
- panic("Recursive entry to debugger");
- }
-
- printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
- dump_stack();
- panic("Recursive entry to debugger");
-
- return 1;
-}
-
-static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
-{
- unsigned long flags;
- int sstep_tries = 100;
- int error = 0;
- int i, cpu;
- int trace_on = 0;
-acquirelock:
- /*
- * Interrupts will be restored by the 'trap return' code, except when
- * single stepping.
- */
- local_irq_save(flags);
-
- cpu = ks->cpu;
- kgdb_info[cpu].debuggerinfo = regs;
- kgdb_info[cpu].task = current;
- /*
- * Make sure the above info reaches the primary CPU before
- * our cpu_in_kgdb[] flag setting does:
- */
- atomic_inc(&cpu_in_kgdb[cpu]);
-
- /*
- * CPU will loop if it is a slave or request to become a kgdb
- * master cpu and acquire the kgdb_active lock:
- */
- while (1) {
- if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
- if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
- break;
- } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
- if (!atomic_read(&passive_cpu_wait[cpu]))
- goto return_normal;
- } else {
-return_normal:
- /* Return to normal operation by executing any
- * hw breakpoint fixup.
- */
- if (arch_kgdb_ops.correct_hw_break)
- arch_kgdb_ops.correct_hw_break();
- if (trace_on)
- tracing_on();
- atomic_dec(&cpu_in_kgdb[cpu]);
- touch_softlockup_watchdog_sync();
- clocksource_touch_watchdog();
- local_irq_restore(flags);
- return 0;
- }
- cpu_relax();
- }
-
- /*
- * For single stepping, try to only enter on the processor
- * that was single stepping. To gaurd against a deadlock, the
- * kernel will only try for the value of sstep_tries before
- * giving up and continuing on.
- */
- if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
- (kgdb_info[cpu].task &&
- kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
- atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog_sync();
- clocksource_touch_watchdog();
- local_irq_restore(flags);
-
- goto acquirelock;
- }
-
- if (!kgdb_io_ready(1)) {
- error = 1;
- goto kgdb_restore; /* No I/O connection, so resume the system */
- }
-
- /*
- * Don't enter if we have hit a removed breakpoint.
- */
- if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
- goto kgdb_restore;
-
- /* Call the I/O driver's pre_exception routine */
- if (kgdb_io_ops->pre_exception)
- kgdb_io_ops->pre_exception();
-
- kgdb_disable_hw_debug(ks->linux_regs);
-
- /*
- * Get the passive CPU lock which will hold all the non-primary
- * CPU in a spin state while the debugger is active
- */
- if (!kgdb_single_step) {
- for (i = 0; i < NR_CPUS; i++)
- atomic_inc(&passive_cpu_wait[i]);
- }
-
-#ifdef CONFIG_SMP
- /* Signal the other CPUs to enter kgdb_wait() */
- if ((!kgdb_single_step) && kgdb_do_roundup)
- kgdb_roundup_cpus(flags);
-#endif
-
- /*
- * Wait for the other CPUs to be notified and be waiting for us:
- */
- for_each_online_cpu(i) {
- while (!atomic_read(&cpu_in_kgdb[i]))
- cpu_relax();
- }
-
- /*
- * At this point the primary processor is completely
- * in the debugger and all secondary CPUs are quiescent
- */
- kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
- kgdb_deactivate_sw_breakpoints();
- kgdb_single_step = 0;
- kgdb_contthread = current;
- exception_level = 0;
- trace_on = tracing_is_on();
- if (trace_on)
- tracing_off();
-
- /* Talk to debugger with gdbserial protocol */
- error = gdb_serial_stub(ks);
-
- /* Call the I/O driver's post_exception routine */
- if (kgdb_io_ops->post_exception)
- kgdb_io_ops->post_exception();
-
- atomic_dec(&cpu_in_kgdb[ks->cpu]);
-
- if (!kgdb_single_step) {
- for (i = NR_CPUS-1; i >= 0; i--)
- atomic_dec(&passive_cpu_wait[i]);
- /*
- * Wait till all the CPUs have quit
- * from the debugger.
- */
- for_each_online_cpu(i) {
- while (atomic_read(&cpu_in_kgdb[i]))
- cpu_relax();
- }
- }
-
-kgdb_restore:
- if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
- int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
- if (kgdb_info[sstep_cpu].task)
- kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
- else
- kgdb_sstep_pid = 0;
- }
- if (trace_on)
- tracing_on();
- /* Free kgdb_active */
- atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog_sync();
- clocksource_touch_watchdog();
- local_irq_restore(flags);
-
- return error;
-}
-
-/*
- * kgdb_handle_exception() - main entry point from a kernel exception
- *
- * Locking hierarchy:
- * interface locks, if any (begin_session)
- * kgdb lock (kgdb_active)
- */
-int
-kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
-{
- struct kgdb_state kgdb_var;
- struct kgdb_state *ks = &kgdb_var;
- int ret;
-
- ks->cpu = raw_smp_processor_id();
- ks->ex_vector = evector;
- ks->signo = signo;
- ks->ex_vector = evector;
- ks->err_code = ecode;
- ks->kgdb_usethreadid = 0;
- ks->linux_regs = regs;
-
- if (kgdb_reenter_check(ks))
- return 0; /* Ouch, double exception ! */
- kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
- ret = kgdb_cpu_enter(ks, regs);
- kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
- return ret;
-}
-
-int kgdb_nmicallback(int cpu, void *regs)
-{
-#ifdef CONFIG_SMP
- struct kgdb_state kgdb_var;
- struct kgdb_state *ks = &kgdb_var;
-
- memset(ks, 0, sizeof(struct kgdb_state));
- ks->cpu = cpu;
- ks->linux_regs = regs;
-
- if (!atomic_read(&cpu_in_kgdb[cpu]) &&
- atomic_read(&kgdb_active) != -1 &&
- atomic_read(&kgdb_active) != cpu) {
- kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
- kgdb_cpu_enter(ks, regs);
- kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
- return 0;
- }
-#endif
- return 1;
-}
-
-static void kgdb_console_write(struct console *co, const char *s,
- unsigned count)
-{
- unsigned long flags;
-
- /* If we're debugging, or KGDB has not connected, don't try
- * and print. */
- if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
- return;
-
- local_irq_save(flags);
- kgdb_msg_write(s, count);
- local_irq_restore(flags);
-}
-
-static struct console kgdbcons = {
- .name = "kgdb",
- .write = kgdb_console_write,
- .flags = CON_PRINTBUFFER | CON_ENABLED,
- .index = -1,
-};
-
-#ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handle_gdb(int key, struct tty_struct *tty)
-{
- if (!kgdb_io_ops) {
- printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
- return;
- }
- if (!kgdb_connected)
- printk(KERN_CRIT "Entering KGDB\n");
-
- kgdb_breakpoint();
-}
-
-static struct sysrq_key_op sysrq_gdb_op = {
- .handler = sysrq_handle_gdb,
- .help_msg = "debug(G)",
- .action_msg = "DEBUG",
-};
-#endif
-
-static void kgdb_register_callbacks(void)
-{
- if (!kgdb_io_module_registered) {
- kgdb_io_module_registered = 1;
- kgdb_arch_init();
-#ifdef CONFIG_MAGIC_SYSRQ
- register_sysrq_key('g', &sysrq_gdb_op);
-#endif
- if (kgdb_use_con && !kgdb_con_registered) {
- register_console(&kgdbcons);
- kgdb_con_registered = 1;
- }
- }
-}
-
-static void kgdb_unregister_callbacks(void)
-{
- /*
- * When this routine is called KGDB should unregister from the
- * panic handler and clean up, making sure it is not handling any
- * break exceptions at the time.
- */
- if (kgdb_io_module_registered) {
- kgdb_io_module_registered = 0;
- kgdb_arch_exit();
-#ifdef CONFIG_MAGIC_SYSRQ
- unregister_sysrq_key('g', &sysrq_gdb_op);
-#endif
- if (kgdb_con_registered) {
- unregister_console(&kgdbcons);
- kgdb_con_registered = 0;
- }
- }
-}
-
-static void kgdb_initial_breakpoint(void)
-{
- kgdb_break_asap = 0;
-
- printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
- kgdb_breakpoint();
-}
-
-/**
- * kgdb_register_io_module - register KGDB IO module
- * @new_kgdb_io_ops: the io ops vector
- *
- * Register it with the KGDB core.
- */
-int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
-{
- int err;
-
- spin_lock(&kgdb_registration_lock);
-
- if (kgdb_io_ops) {
- spin_unlock(&kgdb_registration_lock);
-
- printk(KERN_ERR "kgdb: Another I/O driver is already "
- "registered with KGDB.\n");
- return -EBUSY;
- }
-
- if (new_kgdb_io_ops->init) {
- err = new_kgdb_io_ops->init();
- if (err) {
- spin_unlock(&kgdb_registration_lock);
- return err;
- }
- }
-
- kgdb_io_ops = new_kgdb_io_ops;
-
- spin_unlock(&kgdb_registration_lock);
-
- printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
- new_kgdb_io_ops->name);
-
- /* Arm KGDB now. */
- kgdb_register_callbacks();
-
- if (kgdb_break_asap)
- kgdb_initial_breakpoint();
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(kgdb_register_io_module);
-
-/**
- * kkgdb_unregister_io_module - unregister KGDB IO module
- * @old_kgdb_io_ops: the io ops vector
- *
- * Unregister it with the KGDB core.
- */
-void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
-{
- BUG_ON(kgdb_connected);
-
- /*
- * KGDB is no longer able to communicate out, so
- * unregister our callbacks and reset state.
- */
- kgdb_unregister_callbacks();
-
- spin_lock(&kgdb_registration_lock);
-
- WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
- kgdb_io_ops = NULL;
-
- spin_unlock(&kgdb_registration_lock);
-
- printk(KERN_INFO
- "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
- old_kgdb_io_ops->name);
-}
-EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
-
-/**
- * kgdb_breakpoint - generate breakpoint exception
- *
- * This function will generate a breakpoint exception. It is used at the
- * beginning of a program to sync up with a debugger and can be used
- * otherwise as a quick means to stop program execution and "break" into
- * the debugger.
- */
-void kgdb_breakpoint(void)
-{
- atomic_inc(&kgdb_setting_breakpoint);
- wmb(); /* Sync point before breakpoint */
- arch_kgdb_breakpoint();
- wmb(); /* Sync point after breakpoint */
- atomic_dec(&kgdb_setting_breakpoint);
-}
-EXPORT_SYMBOL_GPL(kgdb_breakpoint);
-
-static int __init opt_kgdb_wait(char *str)
-{
- kgdb_break_asap = 1;
-
- if (kgdb_io_module_registered)
- kgdb_initial_breakpoint();
-
- return 0;
-}
-
-early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..6e9b19667a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
trace_module_request(module_name, wait, _RET_IP_);
- ret = call_usermodehelper(modprobe_path, argv, envp,
- wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+ ret = call_usermodehelper_fns(modprobe_path, argv, envp,
+ wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
+ NULL, NULL, NULL);
+
atomic_dec(&kmod_concurrent);
return ret;
}
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
-struct subprocess_info {
- struct work_struct work;
- struct completion *complete;
- struct cred *cred;
- char *path;
- char **argv;
- char **envp;
- enum umh_wait wait;
- int retval;
- struct file *stdin;
- void (*cleanup)(char **argv, char **envp);
-};
-
/*
* This is the task which runs the usermode application
*/
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
struct subprocess_info *sub_info = data;
int retval;
- BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
-
- /* Unblock all signals */
spin_lock_irq(&current->sighand->siglock);
flush_signal_handlers(current, 1);
- sigemptyset(&current->blocked);
- recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- /* Install the credentials */
- commit_creds(sub_info->cred);
- sub_info->cred = NULL;
-
- /* Install input pipe when needed */
- if (sub_info->stdin) {
- struct files_struct *f = current->files;
- struct fdtable *fdt;
- /* no races because files should be private here */
- sys_close(0);
- fd_install(0, sub_info->stdin);
- spin_lock(&f->file_lock);
- fdt = files_fdtable(f);
- FD_SET(0, fdt->open_fds);
- FD_CLR(0, fdt->close_on_exec);
- spin_unlock(&f->file_lock);
-
- /* and disallow core files too */
- current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
- }
-
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed_ptr(current, cpu_all_mask);
@@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data)
*/
set_user_nice(current, 0);
+ if (sub_info->init) {
+ retval = sub_info->init(sub_info);
+ if (retval)
+ goto fail;
+ }
+
retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
/* Exec failed? */
+fail:
sub_info->retval = retval;
do_exit(0);
}
@@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data)
void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup)
- (*info->cleanup)(info->argv, info->envp);
- if (info->cred)
- put_cred(info->cred);
+ (*info->cleanup)(info);
kfree(info);
}
EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +175,16 @@ static int wait_for_helper(void *data)
struct subprocess_info *sub_info = data;
pid_t pid;
- /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
- * populate the status, but will return -ECHILD. */
- allow_signal(SIGCHLD);
+ /* If SIGCLD is ignored sys_wait4 won't populate the status. */
+ spin_lock_irq(&current->sighand->siglock);
+ current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
+ spin_unlock_irq(&current->sighand->siglock);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
- int ret;
-
+ int ret = -ECHILD;
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
@@ -237,10 +205,7 @@ static int wait_for_helper(void *data)
sub_info->retval = ret;
}
- if (sub_info->wait == UMH_NO_WAIT)
- call_usermodehelper_freeinfo(sub_info);
- else
- complete(sub_info->complete);
+ complete(sub_info->complete);
return 0;
}
@@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- pid_t pid;
enum umh_wait wait = sub_info->wait;
-
- BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+ pid_t pid;
/* CLONE_VFORK: wait until the usermode helper has execve'd
* successfully We need the data structures to stay around
* until that is done. */
- if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
+ if (wait == UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
else
@@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work)
switch (wait) {
case UMH_NO_WAIT:
+ call_usermodehelper_freeinfo(sub_info);
break;
case UMH_WAIT_PROC:
if (pid > 0)
break;
- sub_info->retval = pid;
/* FALLTHROUGH */
-
case UMH_WAIT_EXEC:
+ if (pid < 0)
+ sub_info->retval = pid;
complete(sub_info->complete);
}
}
@@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
- sub_info->cred = prepare_usermodehelper_creds();
- if (!sub_info->cred) {
- kfree(sub_info);
- return NULL;
- }
-
out:
return sub_info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);
/**
- * call_usermodehelper_setkeys - set the session keys for usermode helper
- * @info: a subprocess_info returned by call_usermodehelper_setup
- * @session_keyring: the session keyring for the process
- */
-void call_usermodehelper_setkeys(struct subprocess_info *info,
- struct key *session_keyring)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = info->cred->tgcred;
- key_put(tgcred->session_keyring);
- tgcred->session_keyring = key_get(session_keyring);
-#else
- BUG();
-#endif
-}
-EXPORT_SYMBOL(call_usermodehelper_setkeys);
-
-/**
- * call_usermodehelper_setcleanup - set a cleanup function
+ * call_usermodehelper_setfns - set a cleanup/init function
* @info: a subprocess_info returned by call_usermodehelper_setup
* @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
*
- * The cleanup function is just befor ethe subprocess_info is about to
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is just before ethe subprocess_info is about to
* be freed. This can be used for freeing the argv and envp. The
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
*/
-void call_usermodehelper_setcleanup(struct subprocess_info *info,
- void (*cleanup)(char **argv, char **envp))
+void call_usermodehelper_setfns(struct subprocess_info *info,
+ int (*init)(struct subprocess_info *info),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
{
info->cleanup = cleanup;
+ info->init = init;
+ info->data = data;
}
-EXPORT_SYMBOL(call_usermodehelper_setcleanup);
-
-/**
- * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
- * @sub_info: a subprocess_info returned by call_usermodehelper_setup
- * @filp: set to the write-end of a pipe
- *
- * This constructs a pipe, and sets the read end to be the stdin of the
- * subprocess, and returns the write-end in *@filp.
- */
-int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
- struct file **filp)
-{
- struct file *f;
-
- f = create_write_pipe(0);
- if (IS_ERR(f))
- return PTR_ERR(f);
- *filp = f;
-
- f = create_read_pipe(f, 0);
- if (IS_ERR(f)) {
- free_write_pipe(*filp);
- return PTR_ERR(f);
- }
- sub_info->stdin = f;
-
- return 0;
-}
-EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+EXPORT_SYMBOL(call_usermodehelper_setfns);
/**
* call_usermodehelper_exec - start a usermode application
@@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
- BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
- validate_creds(sub_info->cred);
-
helper_lock();
if (sub_info->path[0] == '\0')
goto out;
@@ -498,41 +416,6 @@ unlock:
}
EXPORT_SYMBOL(call_usermodehelper_exec);
-/**
- * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @filp: set to the write-end of a pipe
- *
- * This is a simple wrapper which executes a usermode-helper function
- * with a pipe as stdin. It is implemented entirely in terms of
- * lower-level call_usermodehelper_* functions.
- */
-int call_usermodehelper_pipe(char *path, char **argv, char **envp,
- struct file **filp)
-{
- struct subprocess_info *sub_info;
- int ret;
-
- sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
- if (sub_info == NULL)
- return -ENOMEM;
-
- ret = call_usermodehelper_stdinpipe(sub_info, filp);
- if (ret < 0) {
- call_usermodehelper_freeinfo(sub_info);
- return ret;
- }
-
- ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
- if (ret < 0) /* Failed to execute helper, close pipe */
- filp_close(*filp, NULL);
-
- return ret;
-}
-EXPORT_SYMBOL(call_usermodehelper_pipe);
-
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 21fe3c426948..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
extern const void __stop_notes __attribute__((weak));
#define notes_size (&__stop_notes - &__start_notes)
-static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr,
+static ssize_t notes_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
memcpy(buf, &__start_notes + off, count);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/freezer.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
struct kthread {
int should_stop;
+ void *data;
struct completion exited;
};
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
+/**
+ * kthread_data - return data value specified on kthread creation
+ * @task: kthread task in question
+ *
+ * Return the data value specified when kthread @task was created.
+ * The caller is responsible for ensuring the validity of @task when
+ * calling this function.
+ */
+void *kthread_data(struct task_struct *task)
+{
+ return to_kthread(task)->data;
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
int ret;
self.should_stop = 0;
+ self.data = data;
init_completion(&self.exited);
current->vfork_done = &self.exited;
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
return 0;
}
+
+/**
+ * kthread_worker_fn - kthread function to process kthread_worker
+ * @worker_ptr: pointer to initialized kthread_worker
+ *
+ * This function can be used as @threadfn to kthread_create() or
+ * kthread_run() with @worker_ptr argument pointing to an initialized
+ * kthread_worker. The started kthread will process work_list until
+ * the it is stopped with kthread_stop(). A kthread can also call
+ * this function directly after extra initialization.
+ *
+ * Different kthreads can be used for the same kthread_worker as long
+ * as there's only one kthread attached to it at any given time. A
+ * kthread_worker without an attached kthread simply collects queued
+ * kthread_works.
+ */
+int kthread_worker_fn(void *worker_ptr)
+{
+ struct kthread_worker *worker = worker_ptr;
+ struct kthread_work *work;
+
+ WARN_ON(worker->task);
+ worker->task = current;
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irq(&worker->lock);
+ worker->task = NULL;
+ spin_unlock_irq(&worker->lock);
+ return 0;
+ }
+
+ work = NULL;
+ spin_lock_irq(&worker->lock);
+ if (!list_empty(&worker->work_list)) {
+ work = list_first_entry(&worker->work_list,
+ struct kthread_work, node);
+ list_del_init(&work->node);
+ }
+ spin_unlock_irq(&worker->lock);
+
+ if (work) {
+ __set_current_state(TASK_RUNNING);
+ work->func(work);
+ smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
+ work->done_seq = work->queue_seq;
+ smp_mb(); /* mb worker-b1 paired with flush-b0 */
+ if (atomic_read(&work->flushing))
+ wake_up_all(&work->done);
+ } else if (!freezing(current))
+ schedule();
+
+ try_to_freeze();
+ goto repeat;
+}
+EXPORT_SYMBOL_GPL(kthread_worker_fn);
+
+/**
+ * queue_kthread_work - queue a kthread_work
+ * @worker: target kthread_worker
+ * @work: kthread_work to queue
+ *
+ * Queue @work to work processor @task for async execution. @task
+ * must have been created with kthread_worker_create(). Returns %true
+ * if @work was successfully queued, %false if it was already pending.
+ */
+bool queue_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ bool ret = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ if (list_empty(&work->node)) {
+ list_add_tail(&work->node, &worker->work_list);
+ work->queue_seq++;
+ if (likely(worker->task))
+ wake_up_process(worker->task);
+ ret = true;
+ }
+ spin_unlock_irqrestore(&worker->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_kthread_work);
+
+/**
+ * flush_kthread_work - flush a kthread_work
+ * @work: work to flush
+ *
+ * If @work is queued or executing, wait for it to finish execution.
+ */
+void flush_kthread_work(struct kthread_work *work)
+{
+ int seq = work->queue_seq;
+
+ atomic_inc(&work->flushing);
+
+ /*
+ * mb flush-b0 paired with worker-b1, to make sure either
+ * worker sees the above increment or we see done_seq update.
+ */
+ smp_mb__after_atomic_inc();
+
+ /* A - B <= 0 tests whether B is in front of A regardless of overflow */
+ wait_event(work->done, seq - work->done_seq <= 0);
+ atomic_dec(&work->flushing);
+
+ /*
+ * rmb flush-b1 paired with worker-b0, to make sure our caller
+ * sees every change made by work->func().
+ */
+ smp_mb__after_atomic_dec();
+}
+EXPORT_SYMBOL_GPL(flush_kthread_work);
+
+struct kthread_flush_work {
+ struct kthread_work work;
+ struct completion done;
+};
+
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+ struct kthread_flush_work *fwork =
+ container_of(work, struct kthread_flush_work, work);
+ complete(&fwork->done);
+}
+
+/**
+ * flush_kthread_worker - flush all current works on a kthread_worker
+ * @worker: worker to flush
+ *
+ * Wait until all currently executing or pending works on @worker are
+ * finished.
+ */
+void flush_kthread_worker(struct kthread_worker *worker)
+{
+ struct kthread_flush_work fwork = {
+ KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+ COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+ };
+
+ queue_kthread_work(worker, &fwork.work);
+ wait_for_completion(&fwork.done);
+}
+EXPORT_SYMBOL_GPL(flush_kthread_worker);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ec21304856d1..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
static inline u64 lockstat_clock(void)
{
- return cpu_clock(smp_processor_id());
+ return local_clock();
}
static int lock_point(unsigned long points[], unsigned long ip)
@@ -2711,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
+struct lock_class_key __lockdep_no_validate__;
+
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
* We maintain the dependency maps and validate the locking attempt:
@@ -2745,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
return 0;
}
+ if (lock->key == &__lockdep_no_validate__)
+ check = 1;
+
if (!subclass)
class = lock->class_cache;
/*
diff --git a/kernel/module.c b/kernel/module.c
index e2564580f3f1..d0b5f8db11b4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
/*
Copyright (C) 2002 Richard Henderson
- Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -72,11 +72,19 @@
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-/* List of modules, protected by module_mutex or preempt_disable
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable with preempt_disable),
+ * 2) module_use links,
+ * 3) module_addr_min/module_addr_max.
* (delete uses stop_machine/add uses RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
+#ifdef CONFIG_KGDB_KDB
+struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+#endif /* CONFIG_KGDB_KDB */
+
/* Block module loading/unloading? */
int modules_disabled = 0;
@@ -86,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-/* Bounds of module allocation, for speeding __module_address */
+/* Bounds of module allocation, for speeding __module_address.
+ * Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
int register_module_notifier(struct notifier_block * nb)
@@ -101,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb)
}
EXPORT_SYMBOL(unregister_module_notifier);
+struct load_info {
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long *strmap;
+ unsigned long symoffs, stroffs;
+ struct _ddebug *debug;
+ unsigned int num_debug;
+ struct {
+ unsigned int sym, str, mod, vers, info, pcpu;
+ } index;
+};
+
/* We require a truly strong try_module_get(): 0 means failure due to
ongoing or failed initialization etc. */
static inline int strong_try_module_get(struct module *mod)
@@ -131,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code)
EXPORT_SYMBOL(__module_put_and_exit);
/* Find a module section: 0 means not found. */
-static unsigned int find_sec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings,
- const char *name)
+static unsigned int find_sec(const struct load_info *info, const char *name)
{
unsigned int i;
- for (i = 1; i < hdr->e_shnum; i++)
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
/* Alloc bit cleared means "ignore it." */
- if ((sechdrs[i].sh_flags & SHF_ALLOC)
- && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
+ if ((shdr->sh_flags & SHF_ALLOC)
+ && strcmp(info->secstrings + shdr->sh_name, name) == 0)
return i;
+ }
return 0;
}
/* Find a module section, or NULL. */
-static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
- const char *secstrings, const char *name)
+static void *section_addr(const struct load_info *info, const char *name)
{
/* Section 0 has sh_addr 0. */
- return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
+ return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
}
/* Find a module section, or NULL. Fill in number of "objects" in section. */
-static void *section_objs(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings,
+static void *section_objs(const struct load_info *info,
const char *name,
size_t object_size,
unsigned int *num)
{
- unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
+ unsigned int sec = find_sec(info, name);
/* Section 0 has sh_addr 0 and sh_size 0. */
- *num = sechdrs[sec].sh_size / object_size;
- return (void *)sechdrs[sec].sh_addr;
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
}
/* Provided by the linker */
@@ -176,8 +195,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const struct kernel_symbol __start___ksymtab_gpl_future[];
extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
extern const unsigned long __start___kcrctab[];
extern const unsigned long __start___kcrctab_gpl[];
extern const unsigned long __start___kcrctab_gpl_future[];
@@ -220,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
unsigned int symnum, void *data), void *data)
{
struct module *mod;
- const struct symsearch arr[] = {
+ static const struct symsearch arr[] = {
{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
NOT_GPL_ONLY, false },
{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -327,7 +344,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
}
/* Find a symbol and return it, along with, (optional) crc and
- * (optional) module which owns it */
+ * (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const unsigned long **crc,
@@ -385,7 +402,8 @@ static int percpu_modalloc(struct module *mod,
mod->percpu = __alloc_reserved_percpu(size, align);
if (!mod->percpu) {
printk(KERN_WARNING
- "Could not allocate %lu bytes percpu data\n", size);
+ "%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, size);
return -ENOMEM;
}
mod->percpu_size = size;
@@ -397,11 +415,9 @@ static void percpu_modfree(struct module *mod)
free_percpu(mod->percpu);
}
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
+static unsigned int find_pcpusec(struct load_info *info)
{
- return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+ return find_sec(info, ".data..percpu");
}
static void percpu_modcopy(struct module *mod,
@@ -461,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod,
static inline void percpu_modfree(struct module *mod)
{
}
-static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
+static unsigned int find_pcpusec(struct load_info *info)
{
return 0;
}
@@ -517,36 +531,30 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
EXPORT_TRACEPOINT_SYMBOL(module_get);
/* Init the unload section of the module. */
-static void module_unload_init(struct module *mod)
+static int module_unload_init(struct module *mod)
{
- int cpu;
+ mod->refptr = alloc_percpu(struct module_ref);
+ if (!mod->refptr)
+ return -ENOMEM;
- INIT_LIST_HEAD(&mod->modules_which_use_me);
- for_each_possible_cpu(cpu) {
- per_cpu_ptr(mod->refptr, cpu)->incs = 0;
- per_cpu_ptr(mod->refptr, cpu)->decs = 0;
- }
+ INIT_LIST_HEAD(&mod->source_list);
+ INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
__this_cpu_write(mod->refptr->incs, 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
-}
-/* modules using other modules */
-struct module_use
-{
- struct list_head list;
- struct module *module_which_uses;
-};
+ return 0;
+}
/* Does a already use b? */
static int already_uses(struct module *a, struct module *b)
{
struct module_use *use;
- list_for_each_entry(use, &b->modules_which_use_me, list) {
- if (use->module_which_uses == a) {
+ list_for_each_entry(use, &b->source_list, source_list) {
+ if (use->source == a) {
DEBUGP("%s uses %s!\n", a->name, b->name);
return 1;
}
@@ -555,62 +563,70 @@ static int already_uses(struct module *a, struct module *b)
return 0;
}
-/* Module a uses b */
-int use_module(struct module *a, struct module *b)
+/*
+ * Module a uses b
+ * - we add 'a' as a "source", 'b' as a "target" of module use
+ * - the module_use is added to the list of 'b' sources (so
+ * 'b' can walk the list to see who sourced them), and of 'a'
+ * targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
{
struct module_use *use;
- int no_warn, err;
- if (b == NULL || already_uses(a, b)) return 1;
+ DEBUGP("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use) {
+ printk(KERN_WARNING "%s: out of memory loading\n", a->name);
+ return -ENOMEM;
+ }
+
+ use->source = a;
+ use->target = b;
+ list_add(&use->source_list, &b->source_list);
+ list_add(&use->target_list, &a->target_list);
+ return 0;
+}
+
+/* Module a uses b: caller needs module_mutex() */
+int ref_module(struct module *a, struct module *b)
+{
+ int err;
- /* If we're interrupted or time out, we fail. */
- if (wait_event_interruptible_timeout(
- module_wq, (err = strong_try_module_get(b)) != -EBUSY,
- 30 * HZ) <= 0) {
- printk("%s: gave up waiting for init of module %s.\n",
- a->name, b->name);
+ if (b == NULL || already_uses(a, b))
return 0;
- }
- /* If strong_try_module_get() returned a different error, we fail. */
+ /* If module isn't available, we fail. */
+ err = strong_try_module_get(b);
if (err)
- return 0;
+ return err;
- DEBUGP("Allocating new usage for %s.\n", a->name);
- use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use) {
- printk("%s: out of memory loading\n", a->name);
+ err = add_module_usage(a, b);
+ if (err) {
module_put(b);
- return 0;
+ return err;
}
-
- use->module_which_uses = a;
- list_add(&use->list, &b->modules_which_use_me);
- no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
- return 1;
+ return 0;
}
-EXPORT_SYMBOL_GPL(use_module);
+EXPORT_SYMBOL_GPL(ref_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
{
- struct module *i;
-
- list_for_each_entry(i, &modules, list) {
- struct module_use *use;
+ struct module_use *use, *tmp;
- list_for_each_entry(use, &i->modules_which_use_me, list) {
- if (use->module_which_uses == mod) {
- DEBUGP("%s unusing %s\n", mod->name, i->name);
- module_put(i);
- list_del(&use->list);
- kfree(use);
- sysfs_remove_link(i->holders_dir, mod->name);
- /* There can be at most one match. */
- break;
- }
- }
+ mutex_lock(&module_mutex);
+ list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+ struct module *i = use->target;
+ DEBUGP("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->source_list);
+ list_del(&use->target_list);
+ kfree(use);
}
+ mutex_unlock(&module_mutex);
+
+ free_percpu(mod->refptr);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -733,7 +749,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
goto out;
}
- if (!list_empty(&mod->modules_which_use_me)) {
+ if (!list_empty(&mod->source_list)) {
/* Other modules depend on us: get rid of them first. */
ret = -EWOULDBLOCK;
goto out;
@@ -777,13 +793,13 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
async_synchronize_full();
- mutex_lock(&module_mutex);
+
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
- ddebug_remove_module(mod->name);
- free_module(mod);
- out:
+ free_module(mod);
+ return 0;
+out:
mutex_unlock(&module_mutex);
return ret;
}
@@ -797,9 +813,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
/* Always include a trailing , so userspace can differentiate
between this and the old multi-field proc format. */
- list_for_each_entry(use, &mod->modules_which_use_me, list) {
+ list_for_each_entry(use, &mod->source_list, source_list) {
printed_something = 1;
- seq_printf(m, "%s,", use->module_which_uses->name);
+ seq_printf(m, "%s,", use->source->name);
}
if (mod->init != NULL && mod->exit == NULL) {
@@ -878,14 +894,15 @@ static inline void module_unload_free(struct module *mod)
{
}
-int use_module(struct module *a, struct module *b)
+int ref_module(struct module *a, struct module *b)
{
- return strong_try_module_get(b) == 0;
+ return strong_try_module_get(b);
}
-EXPORT_SYMBOL_GPL(use_module);
+EXPORT_SYMBOL_GPL(ref_module);
-static inline void module_unload_init(struct module *mod)
+static inline int module_unload_init(struct module *mod)
{
+ return 0;
}
#endif /* CONFIG_MODULE_UNLOAD */
@@ -999,6 +1016,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
+ /* Since this should be found in kernel (which can't be removed),
+ * no locking is necessary. */
if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
&crc, true, false))
BUG();
@@ -1041,35 +1060,68 @@ static inline int same_magic(const char *amagic, const char *bmagic,
}
#endif /* CONFIG_MODVERSIONS */
-/* Resolve a symbol for this module. I.e. if we find one, record usage.
- Must be holding module_mutex. */
-static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
- unsigned int versindex,
+/* Resolve a symbol for this module. I.e. if we find one, record usage. */
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+ const struct load_info *info,
const char *name,
- struct module *mod)
+ char ownername[])
{
struct module *owner;
const struct kernel_symbol *sym;
const unsigned long *crc;
+ int err;
+ mutex_lock(&module_mutex);
sym = find_symbol(name, &owner, &crc,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
- /* use_module can fail due to OOM,
- or module initialization or unloading */
- if (sym) {
- if (!check_version(sechdrs, versindex, name, mod, crc, owner)
- || !use_module(mod, owner))
- sym = NULL;
+ if (!sym)
+ goto unlock;
+
+ if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
+ owner)) {
+ sym = ERR_PTR(-EINVAL);
+ goto getname;
}
+
+ err = ref_module(mod, owner);
+ if (err) {
+ sym = ERR_PTR(err);
+ goto getname;
+ }
+
+getname:
+ /* We must make copy under the lock if we failed to get ref. */
+ strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
+unlock:
+ mutex_unlock(&module_mutex);
return sym;
}
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+ const struct load_info *info,
+ const char *name)
+{
+ const struct kernel_symbol *ksym;
+ char owner[MODULE_NAME_LEN];
+
+ if (wait_event_interruptible_timeout(module_wq,
+ !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+ || PTR_ERR(ksym) != -EBUSY,
+ 30 * HZ) <= 0) {
+ printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
+ }
+ return ksym;
+}
+
/*
* /sys/module/foo/sections stuff
* J. Corbet <corbet@lwn.net>
*/
-#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
+#ifdef CONFIG_SYSFS
+#ifdef CONFIG_KALLSYMS
static inline bool sect_empty(const Elf_Shdr *sect)
{
return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1106,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
kfree(sect_attrs);
}
-static void add_sect_attrs(struct module *mod, unsigned int nsect,
- char *secstrings, Elf_Shdr *sechdrs)
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
{
unsigned int nloaded = 0, i, size[2];
struct module_sect_attrs *sect_attrs;
@@ -1115,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
struct attribute **gattr;
/* Count loaded sections and allocate structures */
- for (i = 0; i < nsect; i++)
- if (!sect_empty(&sechdrs[i]))
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
nloaded++;
size[0] = ALIGN(sizeof(*sect_attrs)
+ nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1133,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
gattr = &sect_attrs->grp.attrs[0];
- for (i = 0; i < nsect; i++) {
- if (sect_empty(&sechdrs[i]))
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+ if (sect_empty(sec))
continue;
- sattr->address = sechdrs[i].sh_addr;
- sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+ sattr->address = sec->sh_addr;
+ sattr->name = kstrdup(info->secstrings + sec->sh_name,
GFP_KERNEL);
if (sattr->name == NULL)
goto out;
@@ -1182,7 +1234,7 @@ struct module_notes_attrs {
struct bin_attribute attrs[0];
};
-static ssize_t module_notes_read(struct kobject *kobj,
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t pos, size_t count)
{
@@ -1205,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
kfree(notes_attrs);
}
-static void add_notes_attrs(struct module *mod, unsigned int nsect,
- char *secstrings, Elf_Shdr *sechdrs)
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
{
unsigned int notes, loaded, i;
struct module_notes_attrs *notes_attrs;
@@ -1218,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
/* Count notes sections and allocate structures. */
notes = 0;
- for (i = 0; i < nsect; i++)
- if (!sect_empty(&sechdrs[i]) &&
- (sechdrs[i].sh_type == SHT_NOTE))
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ (info->sechdrs[i].sh_type == SHT_NOTE))
++notes;
if (notes == 0)
@@ -1234,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
notes_attrs->notes = notes;
nattr = &notes_attrs->attrs[0];
- for (loaded = i = 0; i < nsect; ++i) {
- if (sect_empty(&sechdrs[i]))
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
continue;
- if (sechdrs[i].sh_type == SHT_NOTE) {
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
sysfs_bin_attr_init(nattr);
nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
nattr->attr.mode = S_IRUGO;
- nattr->size = sechdrs[i].sh_size;
- nattr->private = (void *) sechdrs[i].sh_addr;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *) info->sechdrs[i].sh_addr;
nattr->read = module_notes_read;
++nattr;
}
@@ -1273,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod)
#else
-static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
- char *sectstrings, Elf_Shdr *sechdrs)
+static inline void add_sect_attrs(struct module *mod,
+ const struct load_info *info)
{
}
@@ -1282,18 +1333,44 @@ static inline void remove_sect_attrs(struct module *mod)
{
}
-static inline void add_notes_attrs(struct module *mod, unsigned int nsect,
- char *sectstrings, Elf_Shdr *sechdrs)
+static inline void add_notes_attrs(struct module *mod,
+ const struct load_info *info)
{
}
static inline void remove_notes_attrs(struct module *mod)
{
}
+#endif /* CONFIG_KALLSYMS */
+
+static void add_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+ int nowarn;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ nowarn = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ }
+ mutex_unlock(&module_mutex);
#endif
+}
-#ifdef CONFIG_SYSFS
-int module_add_modinfo_attrs(struct module *mod)
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
{
struct module_attribute *attr;
struct module_attribute *temp_attr;
@@ -1319,7 +1396,7 @@ int module_add_modinfo_attrs(struct module *mod)
return error;
}
-void module_remove_modinfo_attrs(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod)
{
struct module_attribute *attr;
int i;
@@ -1335,7 +1412,7 @@ void module_remove_modinfo_attrs(struct module *mod)
kfree(mod->modinfo_attrs);
}
-int mod_sysfs_init(struct module *mod)
+static int mod_sysfs_init(struct module *mod)
{
int err;
struct kobject *kobj;
@@ -1369,12 +1446,17 @@ out:
return err;
}
-int mod_sysfs_setup(struct module *mod,
+static int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
struct kernel_param *kparam,
unsigned int num_params)
{
int err;
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
if (!mod->holders_dir) {
err = -ENOMEM;
@@ -1389,6 +1471,10 @@ int mod_sysfs_setup(struct module *mod,
if (err)
goto out_unreg_param;
+ add_usage_links(mod);
+ add_sect_attrs(mod, info);
+ add_notes_attrs(mod, info);
+
kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
@@ -1398,24 +1484,44 @@ out_unreg_holders:
kobject_put(mod->holders_dir);
out_unreg:
kobject_put(&mod->mkobj.kobj);
+out:
return err;
}
static void mod_sysfs_fini(struct module *mod)
{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
kobject_put(&mod->mkobj.kobj);
}
-#else /* CONFIG_SYSFS */
+#else /* !CONFIG_SYSFS */
+
+static int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
static void mod_sysfs_fini(struct module *mod)
{
}
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+}
+
+static void del_usage_links(struct module *mod)
+{
+}
+
#endif /* CONFIG_SYSFS */
-static void mod_kobject_remove(struct module *mod)
+static void mod_sysfs_teardown(struct module *mod)
{
+ del_usage_links(mod);
module_remove_modinfo_attrs(mod);
module_param_sysfs_remove(mod);
kobject_put(mod->mkobj.drivers_dir);
@@ -1434,16 +1540,19 @@ static int __unlink_module(void *_mod)
return 0;
}
-/* Free a module, remove from lists, etc (must hold module_mutex). */
+/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
trace_module_free(mod);
/* Delete from various lists */
+ mutex_lock(&module_mutex);
stop_machine(__unlink_module, mod, NULL);
- remove_notes_attrs(mod);
- remove_sect_attrs(mod);
- mod_kobject_remove(mod);
+ mutex_unlock(&module_mutex);
+ mod_sysfs_teardown(mod);
+
+ /* Remove dynamic debug info */
+ ddebug_remove_module(mod->name);
/* Arch-specific cleanup. */
module_arch_cleanup(mod);
@@ -1458,10 +1567,7 @@ static void free_module(struct module *mod)
module_free(mod, mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
-#if defined(CONFIG_MODULE_UNLOAD)
- if (mod->refptr)
- free_percpu(mod->refptr);
-#endif
+
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1491,6 +1597,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
/*
* Ensure that an exported symbol [global namespace] does not already exist
* in the kernel or in some other module's exported symbol table.
+ *
+ * You must hold the module_mutex.
*/
static int verify_export_symbols(struct module *mod)
{
@@ -1525,25 +1633,23 @@ static int verify_export_symbols(struct module *mod)
}
/* Change all symbols so that st_value encodes the pointer directly. */
-static int simplify_symbols(Elf_Shdr *sechdrs,
- unsigned int symindex,
- const char *strtab,
- unsigned int versindex,
- unsigned int pcpuindex,
- struct module *mod)
-{
- Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
+static int simplify_symbols(struct module *mod, const struct load_info *info)
+{
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ Elf_Sym *sym = (void *)symsec->sh_addr;
unsigned long secbase;
- unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+ unsigned int i;
int ret = 0;
const struct kernel_symbol *ksym;
- for (i = 1; i < n; i++) {
+ for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ const char *name = info->strtab + sym[i].st_name;
+
switch (sym[i].st_shndx) {
case SHN_COMMON:
/* We compiled with -fno-common. These are not
supposed to happen. */
- DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
+ DEBUGP("Common symbol: %s\n", name);
printk("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
@@ -1556,29 +1662,28 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
break;
case SHN_UNDEF:
- ksym = resolve_symbol(sechdrs, versindex,
- strtab + sym[i].st_name, mod);
+ ksym = resolve_symbol_wait(mod, info, name);
/* Ok if resolved. */
- if (ksym) {
+ if (ksym && !IS_ERR(ksym)) {
sym[i].st_value = ksym->value;
break;
}
/* Ok if weak. */
- if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
- printk(KERN_WARNING "%s: Unknown symbol %s\n",
- mod->name, strtab + sym[i].st_name);
- ret = -ENOENT;
+ printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
+ mod->name, name, PTR_ERR(ksym));
+ ret = PTR_ERR(ksym) ?: -ENOENT;
break;
default:
/* Divert to percpu allocation if a percpu var. */
- if (sym[i].st_shndx == pcpuindex)
+ if (sym[i].st_shndx == info->index.pcpu)
secbase = (unsigned long)mod_percpu(mod);
else
- secbase = sechdrs[sym[i].st_shndx].sh_addr;
+ secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
sym[i].st_value += secbase;
break;
}
@@ -1587,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
return ret;
}
+static int apply_relocations(struct module *mod, const struct load_info *info)
+{
+ unsigned int i;
+ int err = 0;
+
+ /* Now do relocations. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ unsigned int infosec = info->sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (infosec >= info->hdr->e_shnum)
+ continue;
+
+ /* Don't bother with non-allocated sections */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (info->sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ else if (info->sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
/* Additional bytes needed by arch in front of individual sections */
unsigned int __weak arch_mod_section_prepend(struct module *mod,
unsigned int section)
@@ -1611,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size,
might -- code, read-only data, read-write data, small data. Tally
sizes, and place the offsets into sh_entsize fields: high bit means it
belongs in init. */
-static void layout_sections(struct module *mod,
- const Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
+static void layout_sections(struct module *mod, struct load_info *info)
{
static unsigned long const masks[][2] = {
/* NOTE: all executable code must be the first section
@@ -1627,21 +1758,22 @@ static void layout_sections(struct module *mod,
};
unsigned int m, i;
- for (i = 0; i < hdr->e_shnum; i++)
- sechdrs[i].sh_entsize = ~0UL;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
DEBUGP("Core section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < hdr->e_shnum; ++i) {
- Elf_Shdr *s = &sechdrs[i];
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || strstarts(secstrings + s->sh_name, ".init"))
+ || strstarts(sname, ".init"))
continue;
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
- DEBUGP("\t%s\n", secstrings + s->sh_name);
+ DEBUGP("\t%s\n", name);
}
if (m == 0)
mod->core_text_size = mod->core_size;
@@ -1649,17 +1781,18 @@ static void layout_sections(struct module *mod,
DEBUGP("Init section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < hdr->e_shnum; ++i) {
- Elf_Shdr *s = &sechdrs[i];
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || !strstarts(secstrings + s->sh_name, ".init"))
+ || !strstarts(sname, ".init"))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
| INIT_OFFSET_MASK);
- DEBUGP("\t%s\n", secstrings + s->sh_name);
+ DEBUGP("\t%s\n", sname);
}
if (m == 0)
mod->init_text_size = mod->init_size;
@@ -1698,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize)
return string;
}
-static char *get_modinfo(Elf_Shdr *sechdrs,
- unsigned int info,
- const char *tag)
+static char *get_modinfo(struct load_info *info, const char *tag)
{
char *p;
unsigned int taglen = strlen(tag);
- unsigned long size = sechdrs[info].sh_size;
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
- for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
+ for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
return p + taglen + 1;
}
return NULL;
}
-static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
- unsigned int infoindex)
+static void setup_modinfo(struct module *mod, struct load_info *info)
{
struct module_attribute *attr;
int i;
for (i = 0; (attr = modinfo_attrs[i]); i++) {
if (attr->setup)
- attr->setup(mod,
- get_modinfo(sechdrs,
- infoindex,
- attr->attr.name));
+ attr->setup(mod, get_modinfo(info, attr->attr.name));
}
}
@@ -1765,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value,
}
/* As per nm */
-static char elf_type(const Elf_Sym *sym,
- Elf_Shdr *sechdrs,
- const char *secstrings,
- struct module *mod)
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
return 'v';
@@ -1799,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym,
else
return 'b';
}
- if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
return 'n';
+ }
return '?';
}
@@ -1825,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
return true;
}
-static unsigned long layout_symtab(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int symindex,
- unsigned int strindex,
- const Elf_Ehdr *hdr,
- const char *secstrings,
- unsigned long *pstroffs,
- unsigned long *strmap)
+static void layout_symtab(struct module *mod, struct load_info *info)
{
- unsigned long symoffs;
- Elf_Shdr *symsect = sechdrs + symindex;
- Elf_Shdr *strsect = sechdrs + strindex;
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
- const char *strtab;
unsigned int i, nsrc, ndst;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
- symindex) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", secstrings + symsect->sh_name);
+ info->index.sym) | INIT_OFFSET_MASK;
+ DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
- src = (void *)hdr + symsect->sh_offset;
+ src = (void *)info->hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
- strtab = (void *)hdr + strsect->sh_offset;
for (ndst = i = 1; i < nsrc; ++i, ++src)
- if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
+ if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
unsigned int j = src->st_name;
- while(!__test_and_set_bit(j, strmap) && strtab[j])
+ while (!__test_and_set_bit(j, info->strmap)
+ && info->strtab[j])
++j;
++ndst;
}
/* Append room for core symbols at end of core part. */
- symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
+ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
+ mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
- strindex) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", secstrings + strsect->sh_name);
+ info->index.str) | INIT_OFFSET_MASK;
+ DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
/* Append room for core symbols' strings at end of core part. */
- *pstroffs = mod->core_size;
- __set_bit(0, strmap);
- mod->core_size += bitmap_weight(strmap, strsect->sh_size);
-
- return symoffs;
+ info->stroffs = mod->core_size;
+ __set_bit(0, info->strmap);
+ mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
}
-static void add_kallsyms(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int shnum,
- unsigned int symindex,
- unsigned int strindex,
- unsigned long symoffs,
- unsigned long stroffs,
- const char *secstrings,
- unsigned long *strmap)
+static void add_kallsyms(struct module *mod, const struct load_info *info)
{
unsigned int i, ndst;
const Elf_Sym *src;
Elf_Sym *dst;
char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- mod->symtab = (void *)sechdrs[symindex].sh_addr;
- mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
- mod->strtab = (void *)sechdrs[strindex].sh_addr;
+ mod->symtab = (void *)symsec->sh_addr;
+ mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
/* Set types up while we still have access to sections. */
for (i = 0; i < mod->num_symtab; i++)
- mod->symtab[i].st_info
- = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
+ mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
- mod->core_symtab = dst = mod->module_core + symoffs;
+ mod->core_symtab = dst = mod->module_core + info->symoffs;
src = mod->symtab;
*dst = *src;
for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
- if (!is_core_symbol(src, sechdrs, shnum))
+ if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
continue;
dst[ndst] = *src;
- dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
+ dst[ndst].st_name = bitmap_weight(info->strmap,
+ dst[ndst].st_name);
++ndst;
}
mod->core_num_syms = ndst;
- mod->core_strtab = s = mod->module_core + stroffs;
- for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
- if (test_bit(i, strmap))
+ mod->core_strtab = s = mod->module_core + info->stroffs;
+ for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
+ if (test_bit(i, info->strmap))
*++s = mod->strtab[i];
}
#else
-static inline unsigned long layout_symtab(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int symindex,
- unsigned int strindex,
- const Elf_Ehdr *hdr,
- const char *secstrings,
- unsigned long *pstroffs,
- unsigned long *strmap)
+static inline void layout_symtab(struct module *mod, struct load_info *info)
{
- return 0;
}
-static inline void add_kallsyms(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int shnum,
- unsigned int symindex,
- unsigned int strindex,
- unsigned long symoffs,
- unsigned long stroffs,
- const char *secstrings,
- const unsigned long *strmap)
+static void add_kallsyms(struct module *mod, struct load_info *info)
{
}
#endif /* CONFIG_KALLSYMS */
static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
{
+ if (!debug)
+ return;
#ifdef CONFIG_DYNAMIC_DEBUG
if (ddebug_add_module(debug, num, debug->modname))
printk(KERN_ERR "dynamic debug error adding module: %s\n",
@@ -1953,77 +2051,70 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
#endif
}
+static void dynamic_debug_remove(struct _ddebug *debug)
+{
+ if (debug)
+ ddebug_remove_module(debug->modname);
+}
+
static void *module_alloc_update_bounds(unsigned long size)
{
void *ret = module_alloc(size);
if (ret) {
+ mutex_lock(&module_mutex);
/* Update module bounds. */
if ((unsigned long)ret < module_addr_min)
module_addr_min = (unsigned long)ret;
if ((unsigned long)ret + size > module_addr_max)
module_addr_max = (unsigned long)ret + size;
+ mutex_unlock(&module_mutex);
}
return ret;
}
#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs, char *secstrings)
+static void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
{
unsigned int i;
/* only scan the sections containing data */
kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
- for (i = 1; i < hdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ const char *name = info->secstrings + info->sechdrs[i].sh_name;
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
continue;
- if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
- && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
+ if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
continue;
- kmemleak_scan_area((void *)sechdrs[i].sh_addr,
- sechdrs[i].sh_size, GFP_KERNEL);
+ kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+ info->sechdrs[i].sh_size, GFP_KERNEL);
}
}
#else
-static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs, char *secstrings)
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
{
}
#endif
-/* Allocate and load the module: note that size of section 0 is always
- zero, and we rely on this for optional sections. */
-static noinline struct module *load_module(void __user *umod,
- unsigned long len,
- const char __user *uargs)
+/* Sets info->hdr and info->len. */
+static int copy_and_check(struct load_info *info,
+ const void __user *umod, unsigned long len,
+ const char __user *uargs)
{
+ int err;
Elf_Ehdr *hdr;
- Elf_Shdr *sechdrs;
- char *secstrings, *args, *modmagic, *strtab = NULL;
- char *staging;
- unsigned int i;
- unsigned int symindex = 0;
- unsigned int strindex = 0;
- unsigned int modindex, versindex, infoindex, pcpuindex;
- struct module *mod;
- long err = 0;
- void *ptr = NULL; /* Stops spurious gcc warning */
- unsigned long symoffs, stroffs, *strmap;
-
- mm_segment_t old_fs;
- DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
if (len < sizeof(*hdr))
- return ERR_PTR(-ENOEXEC);
+ return -ENOEXEC;
/* Suck in entire file: we'll want most of it. */
/* vmalloc barfs on "unusual" numbers. Check here */
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
@@ -2031,138 +2122,225 @@ static noinline struct module *load_module(void __user *umod,
}
/* Sanity checks against insmoding binaries or wrong arch,
- weird elf version */
+ weird elf version */
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
|| hdr->e_type != ET_REL
|| !elf_check_arch(hdr)
- || hdr->e_shentsize != sizeof(*sechdrs)) {
+ || hdr->e_shentsize != sizeof(Elf_Shdr)) {
err = -ENOEXEC;
goto free_hdr;
}
- if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
- goto truncated;
+ if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
- /* Convenience variables */
- sechdrs = (void *)hdr + hdr->e_shoff;
- secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- sechdrs[0].sh_addr = 0;
+ info->hdr = hdr;
+ info->len = len;
+ return 0;
- for (i = 1; i < hdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_NOBITS
- && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
- goto truncated;
+free_hdr:
+ vfree(hdr);
+ return err;
+}
+
+static void free_copy(struct load_info *info)
+{
+ vfree(info->hdr);
+}
+
+static int rewrite_section_headers(struct load_info *info)
+{
+ unsigned int i;
+
+ /* This should always be true, but let's be sure. */
+ info->sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ if (shdr->sh_type != SHT_NOBITS
+ && info->len < shdr->sh_offset + shdr->sh_size) {
+ printk(KERN_ERR "Module len %lu truncated\n",
+ info->len);
+ return -ENOEXEC;
+ }
/* Mark all sections sh_addr with their address in the
temporary image. */
- sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
+ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
- /* Internal symbols and strings. */
- if (sechdrs[i].sh_type == SHT_SYMTAB) {
- symindex = i;
- strindex = sechdrs[i].sh_link;
- strtab = (char *)hdr + sechdrs[strindex].sh_offset;
- }
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
- if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
- sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
+ shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
}
- modindex = find_sec(hdr, sechdrs, secstrings,
- ".gnu.linkonce.this_module");
- if (!modindex) {
+ /* Track but don't keep modinfo and version sections. */
+ info->index.vers = find_sec(info, "__versions");
+ info->index.info = find_sec(info, ".modinfo");
+ info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ return 0;
+}
+
+/*
+ * Set up our basic convenience variables (pointers to section headers,
+ * search for module section index etc), and do some basic section
+ * verification.
+ *
+ * Return the temporary module pointer (we'll replace it with the final
+ * one when we move the module sections around).
+ */
+static struct module *setup_load_info(struct load_info *info)
+{
+ unsigned int i;
+ int err;
+ struct module *mod;
+
+ /* Set up the convenience variables */
+ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+ info->secstrings = (void *)info->hdr
+ + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
+
+ err = rewrite_section_headers(info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Find internal symbols and strings. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
+ info->index.sym = i;
+ info->index.str = info->sechdrs[i].sh_link;
+ info->strtab = (char *)info->hdr
+ + info->sechdrs[info->index.str].sh_offset;
+ break;
+ }
+ }
+
+ info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
+ if (!info->index.mod) {
printk(KERN_WARNING "No module found in object\n");
- err = -ENOEXEC;
- goto free_hdr;
+ return ERR_PTR(-ENOEXEC);
}
/* This is temporary: point mod into copy of data. */
- mod = (void *)sechdrs[modindex].sh_addr;
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
- if (symindex == 0) {
+ if (info->index.sym == 0) {
printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
mod->name);
- err = -ENOEXEC;
- goto free_hdr;
+ return ERR_PTR(-ENOEXEC);
}
- versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
- infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
- pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
-
- /* Don't keep modinfo and version sections. */
- sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
- sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->index.pcpu = find_pcpusec(info);
/* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(sechdrs, versindex, mod)) {
- err = -ENOEXEC;
- goto free_hdr;
- }
+ if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
+ return ERR_PTR(-ENOEXEC);
+
+ return mod;
+}
+
+static int check_modinfo(struct module *mod, struct load_info *info)
+{
+ const char *modmagic = get_modinfo(info, "vermagic");
+ int err;
- modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "bad vermagic");
if (err)
- goto free_hdr;
- } else if (!same_magic(modmagic, vermagic, versindex)) {
+ return err;
+ } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
mod->name, modmagic, vermagic);
- err = -ENOEXEC;
- goto free_hdr;
+ return -ENOEXEC;
}
- staging = get_modinfo(sechdrs, infoindex, "staging");
- if (staging) {
+ if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP);
printk(KERN_WARNING "%s: module is from the staging directory,"
" the quality is unknown, you have been warned.\n",
mod->name);
}
- /* Now copy in args */
- args = strndup_user(uargs, ~0UL >> 1);
- if (IS_ERR(args)) {
- err = PTR_ERR(args);
- goto free_hdr;
- }
+ /* Set up license info based on the info section */
+ set_license(mod, get_modinfo(info, "license"));
- strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
- * sizeof(long), GFP_KERNEL);
- if (!strmap) {
- err = -ENOMEM;
- goto free_mod;
- }
+ return 0;
+}
- if (find_module(mod->name)) {
- err = -EEXIST;
- goto free_mod;
- }
+static void find_module_sections(struct module *mod, struct load_info *info)
+{
+ mod->kp = section_objs(info, "__param",
+ sizeof(*mod->kp), &mod->num_kp);
+ mod->syms = section_objs(info, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(info, "__kcrctab");
+ mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+ mod->gpl_future_syms = section_objs(info,
+ "__ksymtab_gpl_future",
+ sizeof(*mod->gpl_future_syms),
+ &mod->num_gpl_future_syms);
+ mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
- mod->state = MODULE_STATE_COMING;
+#ifdef CONFIG_UNUSED_SYMBOLS
+ mod->unused_syms = section_objs(info, "__ksymtab_unused",
+ sizeof(*mod->unused_syms),
+ &mod->num_unused_syms);
+ mod->unused_crcs = section_addr(info, "__kcrctab_unused");
+ mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
+ sizeof(*mod->unused_gpl_syms),
+ &mod->num_unused_gpl_syms);
+ mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
+#endif
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(info, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+#endif
- /* Allow arches to frob section contents and sizes. */
- err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
- if (err < 0)
- goto free_mod;
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints = section_objs(info, "__tracepoints",
+ sizeof(*mod->tracepoints),
+ &mod->num_tracepoints);
+#endif
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(info, "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+ /*
+ * This section contains pointers to allocated objects in the trace
+ * code and not scanning it leads to false positives.
+ */
+ kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
+ mod->num_trace_events, GFP_KERNEL);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(info, "__mcount_loc",
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
- if (pcpuindex) {
- /* We have a special allocation for this section. */
- err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
- sechdrs[pcpuindex].sh_addralign);
- if (err)
- goto free_mod;
- sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
- }
+ mod->extable = section_objs(info, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
- /* Determine total sizes, and put offsets in sh_entsize. For now
- this is done generically; there doesn't appear to be any
- special cases for the architectures. */
- layout_sections(mod, hdr, sechdrs, secstrings);
- symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
- secstrings, &stroffs, strmap);
+ if (section_addr(info, "__obsparm"))
+ printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+ mod->name);
+
+ info->debug = section_objs(info, "__verbose",
+ sizeof(*info->debug), &info->num_debug);
+}
+
+static int move_module(struct module *mod, struct load_info *info)
+{
+ int i;
+ void *ptr;
/* Do the allocs. */
ptr = module_alloc_update_bounds(mod->core_size);
@@ -2172,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod,
* leak.
*/
kmemleak_not_leak(ptr);
- if (!ptr) {
- err = -ENOMEM;
- goto free_percpu;
- }
+ if (!ptr)
+ return -ENOMEM;
+
memset(ptr, 0, mod->core_size);
mod->module_core = ptr;
@@ -2188,55 +2365,40 @@ static noinline struct module *load_module(void __user *umod,
*/
kmemleak_ignore(ptr);
if (!ptr && mod->init_size) {
- err = -ENOMEM;
- goto free_core;
+ module_free(mod, mod->module_core);
+ return -ENOMEM;
}
memset(ptr, 0, mod->init_size);
mod->module_init = ptr;
/* Transfer each section which specifies SHF_ALLOC */
DEBUGP("final section addresses:\n");
- for (i = 0; i < hdr->e_shnum; i++) {
+ for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
+ Elf_Shdr *shdr = &info->sechdrs[i];
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ if (!(shdr->sh_flags & SHF_ALLOC))
continue;
- if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
+ if (shdr->sh_entsize & INIT_OFFSET_MASK)
dest = mod->module_init
- + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
+ + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
else
- dest = mod->module_core + sechdrs[i].sh_entsize;
+ dest = mod->module_core + shdr->sh_entsize;
- if (sechdrs[i].sh_type != SHT_NOBITS)
- memcpy(dest, (void *)sechdrs[i].sh_addr,
- sechdrs[i].sh_size);
+ if (shdr->sh_type != SHT_NOBITS)
+ memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
/* Update sh_addr to point to copy in image. */
- sechdrs[i].sh_addr = (unsigned long)dest;
- DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
- }
- /* Module has been moved. */
- mod = (void *)sechdrs[modindex].sh_addr;
- kmemleak_load_module(mod, hdr, sechdrs, secstrings);
-
-#if defined(CONFIG_MODULE_UNLOAD)
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr) {
- err = -ENOMEM;
- goto free_init;
+ shdr->sh_addr = (unsigned long)dest;
+ DEBUGP("\t0x%lx %s\n",
+ shdr->sh_addr, info->secstrings + shdr->sh_name);
}
-#endif
- /* Now we've moved module, initialize linked lists, etc. */
- module_unload_init(mod);
- /* add kobject, so we can reference it. */
- err = mod_sysfs_init(mod);
- if (err)
- goto free_unload;
-
- /* Set up license info based on the info section */
- set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+ return 0;
+}
+static int check_module_license_and_versions(struct module *mod)
+{
/*
* ndiswrapper is under GPL by itself, but loads proprietary modules.
* Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2249,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod,
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
- /* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, sechdrs, infoindex);
-
- /* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
- mod);
- if (err < 0)
- goto cleanup;
-
- /* Now we've got everything in the final locations, we can
- * find optional sections. */
- mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
- sizeof(*mod->kp), &mod->num_kp);
- mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
- sizeof(*mod->syms), &mod->num_syms);
- mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
- mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
- sizeof(*mod->gpl_syms),
- &mod->num_gpl_syms);
- mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
- mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
- "__ksymtab_gpl_future",
- sizeof(*mod->gpl_future_syms),
- &mod->num_gpl_future_syms);
- mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
- "__kcrctab_gpl_future");
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
- "__ksymtab_unused",
- sizeof(*mod->unused_syms),
- &mod->num_unused_syms);
- mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
- "__kcrctab_unused");
- mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
- "__ksymtab_unused_gpl",
- sizeof(*mod->unused_gpl_syms),
- &mod->num_unused_gpl_syms);
- mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
- "__kcrctab_unused_gpl");
-#endif
-#ifdef CONFIG_CONSTRUCTORS
- mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
- sizeof(*mod->ctors), &mod->num_ctors);
-#endif
-
-#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
- "__tracepoints",
- sizeof(*mod->tracepoints),
- &mod->num_tracepoints);
-#endif
-#ifdef CONFIG_EVENT_TRACING
- mod->trace_events = section_objs(hdr, sechdrs, secstrings,
- "_ftrace_events",
- sizeof(*mod->trace_events),
- &mod->num_trace_events);
- /*
- * This section contains pointers to allocated objects in the trace
- * code and not scanning it leads to false positives.
- */
- kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
- mod->num_trace_events, GFP_KERNEL);
-#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
- /* sechdrs[0].sh_size is always zero */
- mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
- "__mcount_loc",
- sizeof(*mod->ftrace_callsites),
- &mod->num_ftrace_callsites);
-#endif
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2329,67 +2420,16 @@ static noinline struct module *load_module(void __user *umod,
|| (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
#endif
) {
- err = try_to_force_load(mod,
- "no versions for exported symbols");
- if (err)
- goto cleanup;
+ return try_to_force_load(mod,
+ "no versions for exported symbols");
}
#endif
+ return 0;
+}
- /* Now do relocations. */
- for (i = 1; i < hdr->e_shnum; i++) {
- const char *strtab = (char *)sechdrs[strindex].sh_addr;
- unsigned int info = sechdrs[i].sh_info;
-
- /* Not a valid relocation section? */
- if (info >= hdr->e_shnum)
- continue;
-
- /* Don't bother with non-allocated sections */
- if (!(sechdrs[info].sh_flags & SHF_ALLOC))
- continue;
-
- if (sechdrs[i].sh_type == SHT_REL)
- err = apply_relocate(sechdrs, strtab, symindex, i,mod);
- else if (sechdrs[i].sh_type == SHT_RELA)
- err = apply_relocate_add(sechdrs, strtab, symindex, i,
- mod);
- if (err < 0)
- goto cleanup;
- }
-
- /* Find duplicate symbols */
- err = verify_export_symbols(mod);
- if (err < 0)
- goto cleanup;
-
- /* Set up and sort exception table */
- mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
- sizeof(*mod->extable), &mod->num_exentries);
- sort_extable(mod->extable, mod->extable + mod->num_exentries);
-
- /* Finally, copy percpu area over. */
- percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
- sechdrs[pcpuindex].sh_size);
-
- add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
- symoffs, stroffs, secstrings, strmap);
- kfree(strmap);
- strmap = NULL;
-
- if (!mod->taints) {
- struct _ddebug *debug;
- unsigned int num_debug;
-
- debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
- sizeof(*debug), &num_debug);
- if (debug)
- dynamic_debug_setup(debug, num_debug);
- }
-
- err = module_finalize(hdr, sechdrs, mod);
- if (err < 0)
- goto cleanup;
+static void flush_module_icache(const struct module *mod)
+{
+ mm_segment_t old_fs;
/* flush the icache in correct context */
old_fs = get_fs();
@@ -2408,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod,
(unsigned long)mod->module_core + mod->core_size);
set_fs(old_fs);
+}
- mod->args = args;
- if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
- printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
- mod->name);
+static struct module *layout_and_allocate(struct load_info *info)
+{
+ /* Module within temporary copy. */
+ struct module *mod;
+ Elf_Shdr *pcpusec;
+ int err;
+
+ mod = setup_load_info(info);
+ if (IS_ERR(mod))
+ return mod;
+
+ err = check_modinfo(mod, info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(info->hdr, info->sechdrs,
+ info->secstrings, mod);
+ if (err < 0)
+ goto out;
+
+ pcpusec = &info->sechdrs[info->index.pcpu];
+ if (pcpusec->sh_size) {
+ /* We have a special allocation for this section. */
+ err = percpu_modalloc(mod,
+ pcpusec->sh_size, pcpusec->sh_addralign);
+ if (err)
+ goto out;
+ pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
+ }
+
+ /* Determine total sizes, and put offsets in sh_entsize. For now
+ this is done generically; there doesn't appear to be any
+ special cases for the architectures. */
+ layout_sections(mod, info);
+
+ info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
+ * sizeof(long), GFP_KERNEL);
+ if (!info->strmap) {
+ err = -ENOMEM;
+ goto free_percpu;
+ }
+ layout_symtab(mod, info);
+
+ /* Allocate and move to the final place */
+ err = move_module(mod, info);
+ if (err)
+ goto free_strmap;
+
+ /* Module has been copied to its final place now: return it. */
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ kmemleak_load_module(mod, info);
+ return mod;
+
+free_strmap:
+ kfree(info->strmap);
+free_percpu:
+ percpu_modfree(mod);
+out:
+ return ERR_PTR(err);
+}
+
+/* mod is no longer valid after this! */
+static void module_deallocate(struct module *mod, struct load_info *info)
+{
+ kfree(info->strmap);
+ percpu_modfree(mod);
+ module_free(mod, mod->module_init);
+ module_free(mod, mod->module_core);
+}
+
+static int post_relocation(struct module *mod, const struct load_info *info)
+{
+ /* Sort exception table now relocations are done. */
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+ /* Copy relocated percpu area over. */
+ percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+ info->sechdrs[info->index.pcpu].sh_size);
+
+ /* Setup kallsyms-specific fields. */
+ add_kallsyms(mod, info);
+
+ /* Arch-specific module finalizing. */
+ return module_finalize(info->hdr, info->sechdrs, mod);
+}
+
+/* Allocate and load the module: note that size of section 0 is always
+ zero, and we rely on this for optional sections. */
+static struct module *load_module(void __user *umod,
+ unsigned long len,
+ const char __user *uargs)
+{
+ struct load_info info = { NULL, };
+ struct module *mod;
+ long err;
+
+ DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+
+ /* Copy in the blobs from userspace, check they are vaguely sane. */
+ err = copy_and_check(&info, umod, len, uargs);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Figure out module layout, and allocate all the memory. */
+ mod = layout_and_allocate(&info);
+ if (IS_ERR(mod)) {
+ err = PTR_ERR(mod);
+ goto free_copy;
+ }
+
+ /* Now module is in final location, initialize linked lists, etc. */
+ err = module_unload_init(mod);
+ if (err)
+ goto free_module;
+
+ /* Now we've got everything in the final locations, we can
+ * find optional sections. */
+ find_module_sections(mod, &info);
+
+ err = check_module_license_and_versions(mod);
+ if (err)
+ goto free_unload;
+
+ /* Set up MODINFO_ATTR fields */
+ setup_modinfo(mod, &info);
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(mod, &info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = apply_relocations(mod, &info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = post_relocation(mod, &info);
+ if (err < 0)
+ goto free_modinfo;
+
+ flush_module_icache(mod);
+
+ /* Now copy in args */
+ mod->args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(mod->args)) {
+ err = PTR_ERR(mod->args);
+ goto free_arch_cleanup;
+ }
+
+ /* Mark state as coming so strong_try_module_get() ignores us. */
+ mod->state = MODULE_STATE_COMING;
/* Now sew it into the lists so we can get lockdep and oops
* info during argument parsing. Noone should access us, since
@@ -2421,58 +2610,64 @@ static noinline struct module *load_module(void __user *umod,
* function to insert in a way safe to concurrent readers.
* The mutex protects against concurrent writers.
*/
+ mutex_lock(&module_mutex);
+ if (find_module(mod->name)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ /* This has to be done once we're sure module name is unique. */
+ if (!mod->taints)
+ dynamic_debug_setup(info.debug, info.num_debug);
+
+ /* Find duplicate symbols */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto ddebug;
+
list_add_rcu(&mod->list, &modules);
+ mutex_unlock(&module_mutex);
+ /* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
if (err < 0)
goto unlink;
- err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
+ /* Link in to syfs. */
+ err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
if (err < 0)
goto unlink;
- add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
- add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
-
- /* Get rid of temporary copy */
- vfree(hdr);
- trace_module_load(mod);
+ /* Get rid of temporary copy and strmap. */
+ kfree(info.strmap);
+ free_copy(&info);
/* Done! */
+ trace_module_load(mod);
return mod;
unlink:
+ mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ ddebug:
+ if (!mod->taints)
+ dynamic_debug_remove(info.debug);
+ unlock:
+ mutex_unlock(&module_mutex);
synchronize_sched();
+ kfree(mod->args);
+ free_arch_cleanup:
module_arch_cleanup(mod);
- cleanup:
+ free_modinfo:
free_modinfo(mod);
- kobject_del(&mod->mkobj.kobj);
- kobject_put(&mod->mkobj.kobj);
free_unload:
module_unload_free(mod);
-#if defined(CONFIG_MODULE_UNLOAD)
- free_percpu(mod->refptr);
- free_init:
-#endif
- module_free(mod, mod->module_init);
- free_core:
- module_free(mod, mod->module_core);
- /* mod will be freed with core. Don't access it beyond this line! */
- free_percpu:
- percpu_modfree(mod);
- free_mod:
- kfree(args);
- kfree(strmap);
- free_hdr:
- vfree(hdr);
+ free_module:
+ module_deallocate(mod, &info);
+ free_copy:
+ free_copy(&info);
return ERR_PTR(err);
-
- truncated:
- printk(KERN_ERR "Module len %lu truncated\n", len);
- err = -ENOEXEC;
- goto free_hdr;
}
/* Call module constructors. */
@@ -2497,19 +2692,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
- /* Only one module load at a time, please */
- if (mutex_lock_interruptible(&module_mutex) != 0)
- return -EINTR;
-
/* Do all the hard work */
mod = load_module(umod, len, uargs);
- if (IS_ERR(mod)) {
- mutex_unlock(&module_mutex);
+ if (IS_ERR(mod))
return PTR_ERR(mod);
- }
-
- /* Drop lock so they can recurse */
- mutex_unlock(&module_mutex);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
@@ -2526,9 +2712,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
- mutex_lock(&module_mutex);
free_module(mod);
- mutex_unlock(&module_mutex);
wake_up(&module_wq);
return ret;
}
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 632f04c57d82..4c0b7b3e6d2e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct thread_info *owner;
/*
+ * If we own the BKL, then don't spin. The owner of
+ * the mutex might be waiting on us to release the BKL.
+ */
+ if (unlikely(current->lock_depth >= 0))
+ break;
+
+ /*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
diff --git a/kernel/padata.c b/kernel/padata.c
index fd03513c7327..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <linux/rcupdate.h>
-#define MAX_SEQ_NR INT_MAX - NR_CPUS
-#define MAX_OBJ_NUM 10000 * NR_CPUS
+#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
+#define MAX_OBJ_NUM 1000
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
- target_cpu = cpumask_first(pd->cpumask);
+ target_cpu = cpumask_first(pd->cpumask.pcpu);
for (cpu = 0; cpu < cpu_index; cpu++)
- target_cpu = cpumask_next(target_cpu, pd->cpumask);
+ target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
return target_cpu;
}
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
* Hash the sequence numbers to the cpus by taking
* seq_nr mod. number of cpus in use.
*/
- cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
+ cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
return padata_index_to_cpu(pd, cpu_index);
}
-static void padata_parallel_worker(struct work_struct *work)
+static void padata_parallel_worker(struct work_struct *parallel_work)
{
- struct padata_queue *queue;
+ struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
struct padata_instance *pinst;
LIST_HEAD(local_list);
local_bh_disable();
- queue = container_of(work, struct padata_queue, pwork);
- pd = queue->pd;
+ pqueue = container_of(parallel_work,
+ struct padata_parallel_queue, work);
+ pd = pqueue->pd;
pinst = pd->pinst;
- spin_lock(&queue->parallel.lock);
- list_replace_init(&queue->parallel.list, &local_list);
- spin_unlock(&queue->parallel.lock);
+ spin_lock(&pqueue->parallel.lock);
+ list_replace_init(&pqueue->parallel.list, &local_list);
+ spin_unlock(&pqueue->parallel.lock);
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -88,13 +90,13 @@ static void padata_parallel_worker(struct work_struct *work)
local_bh_enable();
}
-/*
+/**
* padata_do_parallel - padata parallelization function
*
* @pinst: padata instance
* @padata: object to be parallelized
* @cb_cpu: cpu the serialization callback function will run on,
- * must be in the cpumask of padata.
+ * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
*
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
struct padata_priv *padata, int cb_cpu)
{
int target_cpu, err;
- struct padata_queue *queue;
+ struct padata_parallel_queue *queue;
struct parallel_data *pd;
rcu_read_lock_bh();
pd = rcu_dereference(pinst->pd);
- err = 0;
- if (!(pinst->flags & PADATA_INIT))
+ err = -EINVAL;
+ if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
+ goto out;
+
+ if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
goto out;
err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
goto out;
- err = -EINVAL;
- if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
- goto out;
-
- err = -EINPROGRESS;
+ err = 0;
atomic_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->seq_nr = atomic_inc_return(&pd->seq_nr);
target_cpu = padata_cpu_hash(padata);
- queue = per_cpu_ptr(pd->queue, target_cpu);
+ queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
list_add_tail(&padata->list, &queue->parallel.list);
spin_unlock(&queue->parallel.lock);
- queue_work_on(target_cpu, pinst->wq, &queue->pwork);
+ queue_work_on(target_cpu, pinst->wq, &queue->work);
out:
rcu_read_unlock_bh();
@@ -152,86 +153,72 @@ out:
}
EXPORT_SYMBOL(padata_do_parallel);
+/*
+ * padata_get_next - Get the next object that needs serialization.
+ *
+ * Return values are:
+ *
+ * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ *
+ * NULL, if all percpu reorder queues are empty.
+ *
+ * -EINPROGRESS, if the next object that needs serialization will
+ * be parallel processed by another cpu and is not yet present in
+ * the cpu's reorder queue.
+ *
+ * -ENODATA, if this cpu has to do the parallel processing for
+ * the next object.
+ */
static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
- int cpu, num_cpus, empty, calc_seq_nr;
- int seq_nr, next_nr, overrun, next_overrun;
- struct padata_queue *queue, *next_queue;
+ int cpu, num_cpus;
+ int next_nr, next_index;
+ struct padata_parallel_queue *queue, *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
- empty = 0;
- next_nr = -1;
- next_overrun = 0;
- next_queue = NULL;
-
- num_cpus = cpumask_weight(pd->cpumask);
-
- for_each_cpu(cpu, pd->cpumask) {
- queue = per_cpu_ptr(pd->queue, cpu);
- reorder = &queue->reorder;
-
- /*
- * Calculate the seq_nr of the object that should be
- * next in this queue.
- */
- overrun = 0;
- calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
- + queue->cpu_index;
-
- if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
- calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
- overrun = 1;
- }
-
- if (!list_empty(&reorder->list)) {
- padata = list_entry(reorder->list.next,
- struct padata_priv, list);
-
- seq_nr = padata->seq_nr;
- BUG_ON(calc_seq_nr != seq_nr);
- } else {
- seq_nr = calc_seq_nr;
- empty++;
- }
+ num_cpus = cpumask_weight(pd->cpumask.pcpu);
- if (next_nr < 0 || seq_nr < next_nr
- || (next_overrun && !overrun)) {
- next_nr = seq_nr;
- next_overrun = overrun;
- next_queue = queue;
- }
+ /*
+ * Calculate the percpu reorder queue and the sequence
+ * number of the next object.
+ */
+ next_nr = pd->processed;
+ next_index = next_nr % num_cpus;
+ cpu = padata_index_to_cpu(pd, next_index);
+ next_queue = per_cpu_ptr(pd->pqueue, cpu);
+
+ if (unlikely(next_nr > pd->max_seq_nr)) {
+ next_nr = next_nr - pd->max_seq_nr - 1;
+ next_index = next_nr % num_cpus;
+ cpu = padata_index_to_cpu(pd, next_index);
+ next_queue = per_cpu_ptr(pd->pqueue, cpu);
+ pd->processed = 0;
}
padata = NULL;
- if (empty == num_cpus)
- goto out;
-
reorder = &next_queue->reorder;
if (!list_empty(&reorder->list)) {
padata = list_entry(reorder->list.next,
struct padata_priv, list);
- if (unlikely(next_overrun)) {
- for_each_cpu(cpu, pd->cpumask) {
- queue = per_cpu_ptr(pd->queue, cpu);
- atomic_set(&queue->num_obj, 0);
- }
- }
+ BUG_ON(next_nr != padata->seq_nr);
spin_lock(&reorder->lock);
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
spin_unlock(&reorder->lock);
- atomic_inc(&next_queue->num_obj);
+ pd->processed++;
goto out;
}
- if (next_nr % num_cpus == next_queue->cpu_index) {
+ queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
+ if (queue->cpu_index == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
goto out;
}
@@ -244,55 +231,90 @@ out:
static void padata_reorder(struct parallel_data *pd)
{
struct padata_priv *padata;
- struct padata_queue *queue;
+ struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
-try_again:
+ /*
+ * We need to ensure that only one cpu can work on dequeueing of
+ * the reorder queue the time. Calculating in which percpu reorder
+ * queue the next object will arrive takes some time. A spinlock
+ * would be highly contended. Also it is not clear in which order
+ * the objects arrive to the reorder queues. So a cpu could wait to
+ * get the lock just to notice that there is nothing to do at the
+ * moment. Therefore we use a trylock and let the holder of the lock
+ * care for all the objects enqueued during the holdtime of the lock.
+ */
if (!spin_trylock_bh(&pd->lock))
- goto out;
+ return;
while (1) {
padata = padata_get_next(pd);
+ /*
+ * All reorder queues are empty, or the next object that needs
+ * serialization is parallel processed by another cpu and is
+ * still on it's way to the cpu's reorder queue, nothing to
+ * do for now.
+ */
if (!padata || PTR_ERR(padata) == -EINPROGRESS)
break;
+ /*
+ * This cpu has to do the parallel processing of the next
+ * object. It's waiting in the cpu's parallelization queue,
+ * so exit imediately.
+ */
if (PTR_ERR(padata) == -ENODATA) {
+ del_timer(&pd->timer);
spin_unlock_bh(&pd->lock);
- goto out;
+ return;
}
- queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
+ squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
- spin_lock(&queue->serial.lock);
- list_add_tail(&padata->list, &queue->serial.list);
- spin_unlock(&queue->serial.lock);
+ spin_lock(&squeue->serial.lock);
+ list_add_tail(&padata->list, &squeue->serial.list);
+ spin_unlock(&squeue->serial.lock);
- queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
+ queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
}
spin_unlock_bh(&pd->lock);
- if (atomic_read(&pd->reorder_objects))
- goto try_again;
+ /*
+ * The next object that needs serialization might have arrived to
+ * the reorder queues in the meantime, we will be called again
+ * from the timer function if noone else cares for it.
+ */
+ if (atomic_read(&pd->reorder_objects)
+ && !(pinst->flags & PADATA_RESET))
+ mod_timer(&pd->timer, jiffies + HZ);
+ else
+ del_timer(&pd->timer);
-out:
return;
}
-static void padata_serial_worker(struct work_struct *work)
+static void padata_reorder_timer(unsigned long arg)
+{
+ struct parallel_data *pd = (struct parallel_data *)arg;
+
+ padata_reorder(pd);
+}
+
+static void padata_serial_worker(struct work_struct *serial_work)
{
- struct padata_queue *queue;
+ struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
local_bh_disable();
- queue = container_of(work, struct padata_queue, swork);
- pd = queue->pd;
+ squeue = container_of(serial_work, struct padata_serial_queue, work);
+ pd = squeue->pd;
- spin_lock(&queue->serial.lock);
- list_replace_init(&queue->serial.list, &local_list);
- spin_unlock(&queue->serial.lock);
+ spin_lock(&squeue->serial.lock);
+ list_replace_init(&squeue->serial.list, &local_list);
+ spin_unlock(&squeue->serial.lock);
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -308,7 +330,7 @@ static void padata_serial_worker(struct work_struct *work)
local_bh_enable();
}
-/*
+/**
* padata_do_serial - padata serialization function
*
* @padata: object to be serialized.
@@ -319,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
void padata_do_serial(struct padata_priv *padata)
{
int cpu;
- struct padata_queue *queue;
+ struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
pd = padata->pd;
cpu = get_cpu();
- queue = per_cpu_ptr(pd->queue, cpu);
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
- spin_lock(&queue->reorder.lock);
+ spin_lock(&pqueue->reorder.lock);
atomic_inc(&pd->reorder_objects);
- list_add_tail(&padata->list, &queue->reorder.list);
- spin_unlock(&queue->reorder.lock);
+ list_add_tail(&padata->list, &pqueue->reorder.list);
+ spin_unlock(&pqueue->reorder.lock);
put_cpu();
@@ -338,55 +360,90 @@ void padata_do_serial(struct padata_priv *padata)
}
EXPORT_SYMBOL(padata_do_serial);
-static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
- const struct cpumask *cpumask)
+static int padata_setup_cpumasks(struct parallel_data *pd,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
{
- int cpu, cpu_index, num_cpus;
- struct padata_queue *queue;
- struct parallel_data *pd;
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
+ return -ENOMEM;
- cpu_index = 0;
+ cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
+ free_cpumask_var(pd->cpumask.cbcpu);
+ return -ENOMEM;
+ }
- pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
- if (!pd)
- goto err;
+ cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
+ return 0;
+}
- pd->queue = alloc_percpu(struct padata_queue);
- if (!pd->queue)
- goto err_free_pd;
+static void __padata_list_init(struct padata_list *pd_list)
+{
+ INIT_LIST_HEAD(&pd_list->list);
+ spin_lock_init(&pd_list->lock);
+}
- if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
- goto err_free_queue;
+/* Initialize all percpu queues used by serial workers */
+static void padata_init_squeues(struct parallel_data *pd)
+{
+ int cpu;
+ struct padata_serial_queue *squeue;
+
+ for_each_cpu(cpu, pd->cpumask.cbcpu) {
+ squeue = per_cpu_ptr(pd->squeue, cpu);
+ squeue->pd = pd;
+ __padata_list_init(&squeue->serial);
+ INIT_WORK(&squeue->work, padata_serial_worker);
+ }
+}
- for_each_possible_cpu(cpu) {
- queue = per_cpu_ptr(pd->queue, cpu);
+/* Initialize all percpu queues used by parallel workers */
+static void padata_init_pqueues(struct parallel_data *pd)
+{
+ int cpu_index, num_cpus, cpu;
+ struct padata_parallel_queue *pqueue;
- queue->pd = pd;
+ cpu_index = 0;
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ pqueue->pd = pd;
+ pqueue->cpu_index = cpu_index;
+ cpu_index++;
+
+ __padata_list_init(&pqueue->reorder);
+ __padata_list_init(&pqueue->parallel);
+ INIT_WORK(&pqueue->work, padata_parallel_worker);
+ atomic_set(&pqueue->num_obj, 0);
+ }
- if (cpumask_test_cpu(cpu, cpumask)
- && cpumask_test_cpu(cpu, cpu_active_mask)) {
- queue->cpu_index = cpu_index;
- cpu_index++;
- } else
- queue->cpu_index = -1;
+ num_cpus = cpumask_weight(pd->cpumask.pcpu);
+ pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
+}
- INIT_LIST_HEAD(&queue->reorder.list);
- INIT_LIST_HEAD(&queue->parallel.list);
- INIT_LIST_HEAD(&queue->serial.list);
- spin_lock_init(&queue->reorder.lock);
- spin_lock_init(&queue->parallel.lock);
- spin_lock_init(&queue->serial.lock);
+/* Allocate and initialize the internal cpumask dependend resources. */
+static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
+ struct parallel_data *pd;
- INIT_WORK(&queue->pwork, padata_parallel_worker);
- INIT_WORK(&queue->swork, padata_serial_worker);
- atomic_set(&queue->num_obj, 0);
- }
+ pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
+ if (!pd)
+ goto err;
- cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+ pd->pqueue = alloc_percpu(struct padata_parallel_queue);
+ if (!pd->pqueue)
+ goto err_free_pd;
- num_cpus = cpumask_weight(pd->cpumask);
- pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
+ pd->squeue = alloc_percpu(struct padata_serial_queue);
+ if (!pd->squeue)
+ goto err_free_pqueue;
+ if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+ goto err_free_squeue;
+ padata_init_pqueues(pd);
+ padata_init_squeues(pd);
+ setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
@@ -395,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
return pd;
-err_free_queue:
- free_percpu(pd->queue);
+err_free_squeue:
+ free_percpu(pd->squeue);
+err_free_pqueue:
+ free_percpu(pd->pqueue);
err_free_pd:
kfree(pd);
err:
@@ -405,15 +464,63 @@ err:
static void padata_free_pd(struct parallel_data *pd)
{
- free_cpumask_var(pd->cpumask);
- free_percpu(pd->queue);
+ free_cpumask_var(pd->cpumask.pcpu);
+ free_cpumask_var(pd->cpumask.cbcpu);
+ free_percpu(pd->pqueue);
+ free_percpu(pd->squeue);
kfree(pd);
}
+/* Flush all objects out of the padata queues. */
+static void padata_flush_queues(struct parallel_data *pd)
+{
+ int cpu;
+ struct padata_parallel_queue *pqueue;
+ struct padata_serial_queue *squeue;
+
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ flush_work(&pqueue->work);
+ }
+
+ del_timer_sync(&pd->timer);
+
+ if (atomic_read(&pd->reorder_objects))
+ padata_reorder(pd);
+
+ for_each_cpu(cpu, pd->cpumask.cbcpu) {
+ squeue = per_cpu_ptr(pd->squeue, cpu);
+ flush_work(&squeue->work);
+ }
+
+ BUG_ON(atomic_read(&pd->refcnt) != 0);
+}
+
+static void __padata_start(struct padata_instance *pinst)
+{
+ pinst->flags |= PADATA_INIT;
+}
+
+static void __padata_stop(struct padata_instance *pinst)
+{
+ if (!(pinst->flags & PADATA_INIT))
+ return;
+
+ pinst->flags &= ~PADATA_INIT;
+
+ synchronize_rcu();
+
+ get_online_cpus();
+ padata_flush_queues(pinst->pd);
+ put_online_cpus();
+}
+
+/* Replace the internal control stucture with a new one. */
static void padata_replace(struct padata_instance *pinst,
struct parallel_data *pd_new)
{
struct parallel_data *pd_old = pinst->pd;
+ int notification_mask = 0;
pinst->flags |= PADATA_RESET;
@@ -421,43 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
synchronize_rcu();
- while (atomic_read(&pd_old->refcnt) != 0)
- yield();
-
- flush_workqueue(pinst->wq);
+ if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
+ notification_mask |= PADATA_CPU_PARALLEL;
+ if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
+ notification_mask |= PADATA_CPU_SERIAL;
+ padata_flush_queues(pd_old);
padata_free_pd(pd_old);
+ if (notification_mask)
+ blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
+ notification_mask,
+ &pd_new->cpumask);
+
pinst->flags &= ~PADATA_RESET;
}
-/*
- * padata_set_cpumask - set the cpumask that padata should use
+/**
+ * padata_register_cpumask_notifier - Registers a notifier that will be called
+ * if either pcpu or cbcpu or both cpumasks change.
*
- * @pinst: padata instance
- * @cpumask: the cpumask to use
+ * @pinst: A poineter to padata instance
+ * @nblock: A pointer to notifier block.
*/
-int padata_set_cpumask(struct padata_instance *pinst,
- cpumask_var_t cpumask)
+int padata_register_cpumask_notifier(struct padata_instance *pinst,
+ struct notifier_block *nblock)
{
- struct parallel_data *pd;
- int err = 0;
+ return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
+ nblock);
+}
+EXPORT_SYMBOL(padata_register_cpumask_notifier);
- might_sleep();
+/**
+ * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
+ * registered earlier using padata_register_cpumask_notifier
+ *
+ * @pinst: A pointer to data instance.
+ * @nlock: A pointer to notifier block.
+ */
+int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
+ struct notifier_block *nblock)
+{
+ return blocking_notifier_chain_unregister(
+ &pinst->cpumask_change_notifier,
+ nblock);
+}
+EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
- mutex_lock(&pinst->lock);
- pd = padata_alloc_pd(pinst, cpumask);
- if (!pd) {
- err = -ENOMEM;
- goto out;
+/* If cpumask contains no active cpu, we mark the instance as invalid. */
+static bool padata_validate_cpumask(struct padata_instance *pinst,
+ const struct cpumask *cpumask)
+{
+ if (!cpumask_intersects(cpumask, cpu_active_mask)) {
+ pinst->flags |= PADATA_INVALID;
+ return false;
+ }
+
+ pinst->flags &= ~PADATA_INVALID;
+ return true;
+}
+
+static int __padata_set_cpumasks(struct padata_instance *pinst,
+ cpumask_var_t pcpumask,
+ cpumask_var_t cbcpumask)
+{
+ int valid;
+ struct parallel_data *pd;
+
+ valid = padata_validate_cpumask(pinst, pcpumask);
+ if (!valid) {
+ __padata_stop(pinst);
+ goto out_replace;
}
- cpumask_copy(pinst->cpumask, cpumask);
+ valid = padata_validate_cpumask(pinst, cbcpumask);
+ if (!valid)
+ __padata_stop(pinst);
+
+out_replace:
+ pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+ if (!pd)
+ return -ENOMEM;
+
+ cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+ cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
padata_replace(pinst, pd);
+ if (valid)
+ __padata_start(pinst);
+
+ return 0;
+}
+
+/**
+ * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
+ * one is used by parallel workers and the second one
+ * by the wokers doing serialization.
+ *
+ * @pinst: padata instance
+ * @pcpumask: the cpumask to use for parallel workers
+ * @cbcpumask: the cpumsak to use for serial workers
+ */
+int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
+ cpumask_var_t cbcpumask)
+{
+ int err;
+
+ mutex_lock(&pinst->lock);
+ get_online_cpus();
+
+ err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
+
+ put_online_cpus();
+ mutex_unlock(&pinst->lock);
+
+ return err;
+
+}
+EXPORT_SYMBOL(padata_set_cpumasks);
+
+/**
+ * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
+ * equivalent to @cpumask.
+ *
+ * @pinst: padata instance
+ * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
+ * to parallel and serial cpumasks respectively.
+ * @cpumask: the cpumask to use
+ */
+int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
+ cpumask_var_t cpumask)
+{
+ struct cpumask *serial_mask, *parallel_mask;
+ int err = -EINVAL;
+
+ mutex_lock(&pinst->lock);
+ get_online_cpus();
+
+ switch (cpumask_type) {
+ case PADATA_CPU_PARALLEL:
+ serial_mask = pinst->cpumask.cbcpu;
+ parallel_mask = cpumask;
+ break;
+ case PADATA_CPU_SERIAL:
+ parallel_mask = pinst->cpumask.pcpu;
+ serial_mask = cpumask;
+ break;
+ default:
+ goto out;
+ }
+
+ err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
+
out:
+ put_online_cpus();
mutex_unlock(&pinst->lock);
return err;
@@ -469,32 +695,50 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
struct parallel_data *pd;
if (cpumask_test_cpu(cpu, cpu_active_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask);
+ pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+ pinst->cpumask.cbcpu);
if (!pd)
return -ENOMEM;
padata_replace(pinst, pd);
+
+ if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
+ padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
+ __padata_start(pinst);
}
return 0;
}
-/*
- * padata_add_cpu - add a cpu to the padata cpumask
+ /**
+ * padata_add_cpu - add a cpu to one or both(parallel and serial)
+ * padata cpumasks.
*
* @pinst: padata instance
* @cpu: cpu to add
+ * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
+ * The @mask may be any combination of the following flags:
+ * PADATA_CPU_SERIAL - serial cpumask
+ * PADATA_CPU_PARALLEL - parallel cpumask
*/
-int padata_add_cpu(struct padata_instance *pinst, int cpu)
+
+int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
{
int err;
- might_sleep();
+ if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+ return -EINVAL;
mutex_lock(&pinst->lock);
- cpumask_set_cpu(cpu, pinst->cpumask);
+ get_online_cpus();
+ if (mask & PADATA_CPU_SERIAL)
+ cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
+ if (mask & PADATA_CPU_PARALLEL)
+ cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
+
err = __padata_add_cpu(pinst, cpu);
+ put_online_cpus();
mutex_unlock(&pinst->lock);
@@ -504,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd;
+ struct parallel_data *pd = NULL;
if (cpumask_test_cpu(cpu, cpu_online_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask);
+
+ if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
+ !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
+ __padata_stop(pinst);
+
+ pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+ pinst->cpumask.cbcpu);
if (!pd)
return -ENOMEM;
@@ -517,22 +767,34 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
return 0;
}
-/*
- * padata_remove_cpu - remove a cpu from the padata cpumask
+ /**
+ * padata_remove_cpu - remove a cpu from the one or both(serial and paralell)
+ * padata cpumasks.
*
* @pinst: padata instance
* @cpu: cpu to remove
+ * @mask: bitmask specifying from which cpumask @cpu should be removed
+ * The @mask may be any combination of the following flags:
+ * PADATA_CPU_SERIAL - serial cpumask
+ * PADATA_CPU_PARALLEL - parallel cpumask
*/
-int padata_remove_cpu(struct padata_instance *pinst, int cpu)
+int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
{
int err;
- might_sleep();
+ if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+ return -EINVAL;
mutex_lock(&pinst->lock);
- cpumask_clear_cpu(cpu, pinst->cpumask);
+ get_online_cpus();
+ if (mask & PADATA_CPU_SERIAL)
+ cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
+ if (mask & PADATA_CPU_PARALLEL)
+ cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
+
err = __padata_remove_cpu(pinst, cpu);
+ put_online_cpus();
mutex_unlock(&pinst->lock);
@@ -540,38 +802,52 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
}
EXPORT_SYMBOL(padata_remove_cpu);
-/*
+/**
* padata_start - start the parallel processing
*
* @pinst: padata instance to start
*/
-void padata_start(struct padata_instance *pinst)
+int padata_start(struct padata_instance *pinst)
{
- might_sleep();
+ int err = 0;
mutex_lock(&pinst->lock);
- pinst->flags |= PADATA_INIT;
+
+ if (pinst->flags & PADATA_INVALID)
+ err =-EINVAL;
+
+ __padata_start(pinst);
+
mutex_unlock(&pinst->lock);
+
+ return err;
}
EXPORT_SYMBOL(padata_start);
-/*
+/**
* padata_stop - stop the parallel processing
*
* @pinst: padata instance to stop
*/
void padata_stop(struct padata_instance *pinst)
{
- might_sleep();
-
mutex_lock(&pinst->lock);
- pinst->flags &= ~PADATA_INIT;
+ __padata_stop(pinst);
mutex_unlock(&pinst->lock);
}
EXPORT_SYMBOL(padata_stop);
-static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+#ifdef CONFIG_HOTPLUG_CPU
+
+static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
+{
+ return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
+ cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
+}
+
+
+static int padata_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
{
int err;
struct padata_instance *pinst;
@@ -582,29 +858,29 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
err = __padata_add_cpu(pinst, cpu);
mutex_unlock(&pinst->lock);
if (err)
- return NOTIFY_BAD;
+ return notifier_from_errno(err);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
err = __padata_remove_cpu(pinst, cpu);
mutex_unlock(&pinst->lock);
if (err)
- return NOTIFY_BAD;
+ return notifier_from_errno(err);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
__padata_remove_cpu(pinst, cpu);
@@ -612,7 +888,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
__padata_add_cpu(pinst, cpu);
@@ -621,77 +897,239 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
+#endif
+
+static void __padata_free(struct padata_instance *pinst)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ unregister_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
+ padata_stop(pinst);
+ padata_free_pd(pinst->pd);
+ free_cpumask_var(pinst->cpumask.pcpu);
+ free_cpumask_var(pinst->cpumask.cbcpu);
+ kfree(pinst);
+}
+
+#define kobj2pinst(_kobj) \
+ container_of(_kobj, struct padata_instance, kobj)
+#define attr2pentry(_attr) \
+ container_of(_attr, struct padata_sysfs_entry, attr)
+
+static void padata_sysfs_release(struct kobject *kobj)
+{
+ struct padata_instance *pinst = kobj2pinst(kobj);
+ __padata_free(pinst);
+}
+
+struct padata_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
+ ssize_t (*store)(struct padata_instance *, struct attribute *,
+ const char *, size_t);
+};
+
+static ssize_t show_cpumask(struct padata_instance *pinst,
+ struct attribute *attr, char *buf)
+{
+ struct cpumask *cpumask;
+ ssize_t len;
+
+ mutex_lock(&pinst->lock);
+ if (!strcmp(attr->name, "serial_cpumask"))
+ cpumask = pinst->cpumask.cbcpu;
+ else
+ cpumask = pinst->cpumask.pcpu;
+
+ len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
+ nr_cpu_ids);
+ if (PAGE_SIZE - len < 2)
+ len = -EINVAL;
+ else
+ len += sprintf(buf + len, "\n");
+
+ mutex_unlock(&pinst->lock);
+ return len;
+}
+
+static ssize_t store_cpumask(struct padata_instance *pinst,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ cpumask_var_t new_cpumask;
+ ssize_t ret;
+ int mask_type;
+
+ if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
+ nr_cpumask_bits);
+ if (ret < 0)
+ goto out;
+
+ mask_type = !strcmp(attr->name, "serial_cpumask") ?
+ PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
+ ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
+ if (!ret)
+ ret = count;
+
+out:
+ free_cpumask_var(new_cpumask);
+ return ret;
+}
+
+#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
+ static struct padata_sysfs_entry _name##_attr = \
+ __ATTR(_name, 0644, _show_name, _store_name)
+#define PADATA_ATTR_RO(_name, _show_name) \
+ static struct padata_sysfs_entry _name##_attr = \
+ __ATTR(_name, 0400, _show_name, NULL)
+
+PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
+PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
/*
- * padata_alloc - allocate and initialize a padata instance
+ * Padata sysfs provides the following objects:
+ * serial_cpumask [RW] - cpumask for serial workers
+ * parallel_cpumask [RW] - cpumask for parallel workers
+ */
+static struct attribute *padata_default_attrs[] = {
+ &serial_cpumask_attr.attr,
+ &parallel_cpumask_attr.attr,
+ NULL,
+};
+
+static ssize_t padata_sysfs_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct padata_instance *pinst;
+ struct padata_sysfs_entry *pentry;
+ ssize_t ret = -EIO;
+
+ pinst = kobj2pinst(kobj);
+ pentry = attr2pentry(attr);
+ if (pentry->show)
+ ret = pentry->show(pinst, attr, buf);
+
+ return ret;
+}
+
+static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct padata_instance *pinst;
+ struct padata_sysfs_entry *pentry;
+ ssize_t ret = -EIO;
+
+ pinst = kobj2pinst(kobj);
+ pentry = attr2pentry(attr);
+ if (pentry->show)
+ ret = pentry->store(pinst, attr, buf, count);
+
+ return ret;
+}
+
+static const struct sysfs_ops padata_sysfs_ops = {
+ .show = padata_sysfs_show,
+ .store = padata_sysfs_store,
+};
+
+static struct kobj_type padata_attr_type = {
+ .sysfs_ops = &padata_sysfs_ops,
+ .default_attrs = padata_default_attrs,
+ .release = padata_sysfs_release,
+};
+
+/**
+ * padata_alloc_possible - Allocate and initialize padata instance.
+ * Use the cpu_possible_mask for serial and
+ * parallel workers.
*
- * @cpumask: cpumask that padata uses for parallelization
* @wq: workqueue to use for the allocated padata instance
*/
-struct padata_instance *padata_alloc(const struct cpumask *cpumask,
- struct workqueue_struct *wq)
+struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
+{
+ return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+}
+EXPORT_SYMBOL(padata_alloc_possible);
+
+/**
+ * padata_alloc - allocate and initialize a padata instance and specify
+ * cpumasks for serial and parallel workers.
+ *
+ * @wq: workqueue to use for the allocated padata instance
+ * @pcpumask: cpumask that will be used for padata parallelization
+ * @cbcpumask: cpumask that will be used for padata serialization
+ */
+struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
{
- int err;
struct padata_instance *pinst;
- struct parallel_data *pd;
+ struct parallel_data *pd = NULL;
pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
if (!pinst)
goto err;
- pd = padata_alloc_pd(pinst, cpumask);
- if (!pd)
+ get_online_cpus();
+ if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ goto err_free_inst;
+ if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
+ free_cpumask_var(pinst->cpumask.pcpu);
goto err_free_inst;
+ }
+ if (!padata_validate_cpumask(pinst, pcpumask) ||
+ !padata_validate_cpumask(pinst, cbcpumask))
+ goto err_free_masks;
- if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
- goto err_free_pd;
+ pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+ if (!pd)
+ goto err_free_masks;
rcu_assign_pointer(pinst->pd, pd);
pinst->wq = wq;
- cpumask_copy(pinst->cpumask, cpumask);
+ cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+ cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
pinst->flags = 0;
+#ifdef CONFIG_HOTPLUG_CPU
pinst->cpu_notifier.notifier_call = padata_cpu_callback;
pinst->cpu_notifier.priority = 0;
- err = register_hotcpu_notifier(&pinst->cpu_notifier);
- if (err)
- goto err_free_cpumask;
+ register_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
+ put_online_cpus();
+ BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
+ kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
return pinst;
-err_free_cpumask:
- free_cpumask_var(pinst->cpumask);
-err_free_pd:
- padata_free_pd(pd);
+err_free_masks:
+ free_cpumask_var(pinst->cpumask.pcpu);
+ free_cpumask_var(pinst->cpumask.cbcpu);
err_free_inst:
kfree(pinst);
+ put_online_cpus();
err:
return NULL;
}
EXPORT_SYMBOL(padata_alloc);
-/*
+/**
* padata_free - free a padata instance
*
- * @ padata_inst: padata instance to free
+ * @padata_inst: padata instance to free
*/
void padata_free(struct padata_instance *pinst)
{
- padata_stop(pinst);
-
- synchronize_rcu();
-
- while (atomic_read(&pinst->pd->refcnt) != 0)
- yield();
-
- unregister_hotcpu_notifier(&pinst->cpu_notifier);
- padata_free_pd(pinst->pd);
- free_cpumask_var(pinst->cpumask);
- kfree(pinst);
+ kobject_put(&pinst->kobj);
}
EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 13d966b4c14a..3b16cd93fa7d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -87,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
preempt_disable();
+ console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
@@ -178,6 +179,7 @@ static const struct tnt tnts[] = {
{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
{ TAINT_WARN, 'W', ' ' },
{ TAINT_CRAP, 'C', ' ' },
+ { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
};
/**
@@ -194,6 +196,7 @@ static const struct tnt tnts[] = {
* 'A' - ACPI table overridden.
* 'W' - Taint on warning.
* 'C' - modules from drivers/staging are loaded.
+ * 'I' - Working around severe firmware bug.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -365,7 +368,8 @@ struct slowpath_args {
va_list args;
};
-static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args)
+static void warn_slowpath_common(const char *file, int line, void *caller,
+ unsigned taint, struct slowpath_args *args)
{
const char *board;
@@ -381,7 +385,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc
print_modules();
dump_stack();
print_oops_end_marker();
- add_taint(TAINT_WARN);
+ add_taint(taint);
}
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -390,14 +394,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0), &args);
+ warn_slowpath_common(file, line, __builtin_return_address(0),
+ TAINT_WARN, &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
+void warn_slowpath_fmt_taint(const char *file, int line,
+ unsigned taint, const char *fmt, ...)
+{
+ struct slowpath_args args;
+
+ args.fmt = fmt;
+ va_start(args.args, fmt);
+ warn_slowpath_common(file, line, __builtin_return_address(0),
+ taint, &args);
+ va_end(args.args);
+}
+EXPORT_SYMBOL(warn_slowpath_fmt_taint);
+
void warn_slowpath_null(const char *file, int line)
{
- warn_slowpath_common(file, line, __builtin_return_address(0), NULL);
+ warn_slowpath_common(file, line, __builtin_return_address(0),
+ TAINT_WARN, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
#endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..403d1804b198 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
static inline u64 perf_clock(void)
{
- return cpu_clock(raw_smp_processor_id());
+ return local_clock();
}
/*
@@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event *group_leader = event->group_leader;
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ event->attach_state |= PERF_ATTACH_CONTEXT;
/*
- * Depending on whether it is a standalone or sibling event,
- * add it straight to the context's event list, or to the group
- * leader's sibling list:
+ * If we're a stand alone event or group leader, we go to the context
+ * list, group events are kept attached to the group so that
+ * perf_group_detach can, at all times, locate all siblings.
*/
- if (group_leader == event) {
+ if (event->group_leader == event) {
struct list_head *list;
if (is_software_event(event))
@@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list = ctx_group_list(event, ctx);
list_add_tail(&event->group_entry, list);
- } else {
- if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
- !is_software_event(event))
- group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
-
- list_add_tail(&event->group_entry, &group_leader->sibling_list);
- group_leader->nr_siblings++;
}
list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_stat++;
}
+static void perf_group_attach(struct perf_event *event)
+{
+ struct perf_event *group_leader = event->group_leader;
+
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+ event->attach_state |= PERF_ATTACH_GROUP;
+
+ if (group_leader == event)
+ return;
+
+ if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+ !is_software_event(event))
+ group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
+ list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+}
+
/*
* Remove a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- if (list_empty(&event->group_entry))
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_CONTEXT))
return;
+
+ event->attach_state &= ~PERF_ATTACH_CONTEXT;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
- list_del_init(&event->group_entry);
list_del_rcu(&event->event_entry);
- if (event->group_leader != event)
- event->group_leader->nr_siblings--;
+ if (event->group_leader == event)
+ list_del_init(&event->group_entry);
update_group_times(event);
@@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->state = PERF_EVENT_STATE_OFF;
}
-static void
-perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
+ struct list_head *list = NULL;
+
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_GROUP))
+ return;
+
+ event->attach_state &= ~PERF_ATTACH_GROUP;
+
+ /*
+ * If this is a sibling, remove it from its group.
+ */
+ if (event->group_leader != event) {
+ list_del_init(&event->group_entry);
+ event->group_leader->nr_siblings--;
+ return;
+ }
+
+ if (!list_empty(&event->group_entry))
+ list = &event->group_entry;
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
- * to the context list directly:
+ * to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- struct list_head *list;
-
- list = ctx_group_list(event, ctx);
- list_move_tail(&sibling->group_entry, list);
+ if (list)
+ list_move_tail(&sibling->group_entry, list);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
@@ -640,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
struct perf_event *event, *partial_group = NULL;
const struct pmu *pmu = group_event->pmu;
bool txn = false;
- int ret;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
@@ -652,8 +686,11 @@ group_sched_in(struct perf_event *group_event,
if (txn)
pmu->start_txn(pmu);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, cpuctx, ctx)) {
+ if (txn)
+ pmu->cancel_txn(pmu);
return -EAGAIN;
+ }
/*
* Schedule in siblings as one group (if any):
@@ -665,19 +702,10 @@ group_sched_in(struct perf_event *group_event,
}
}
- if (!txn)
+ if (!txn || !pmu->commit_txn(pmu))
return 0;
- ret = pmu->commit_txn(pmu);
- if (!ret) {
- pmu->cancel_txn(pmu);
- return 0;
- }
-
group_error:
- if (txn)
- pmu->cancel_txn(pmu);
-
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
@@ -689,6 +717,9 @@ group_error:
}
event_sched_out(group_event, cpuctx, ctx);
+ if (txn)
+ pmu->cancel_txn(pmu);
+
return -EAGAIN;
}
@@ -727,6 +758,7 @@ static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
list_add_event(event, ctx);
+ perf_group_attach(event);
event->tstamp_enabled = ctx->time;
event->tstamp_running = ctx->time;
event->tstamp_stopped = ctx->time;
@@ -1116,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
* In order to keep per-task stats reliable we need to flip the event
* values when we flip the contexts.
*/
- value = atomic64_read(&next_event->count);
- value = atomic64_xchg(&event->count, value);
- atomic64_set(&next_event->count, value);
+ value = local64_read(&next_event->count);
+ value = local64_xchg(&event->count, value);
+ local64_set(&next_event->count, value);
swap(event->total_time_enabled, next_event->total_time_enabled);
swap(event->total_time_running, next_event->total_time_running);
@@ -1468,6 +1500,9 @@ do { \
divisor = nsec * frequency;
}
+ if (!divisor)
+ return dividend;
+
return div64_u64(dividend, divisor);
}
@@ -1490,7 +1525,7 @@ static int perf_event_start(struct perf_event *event)
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
{
struct hw_perf_event *hwc = &event->hw;
- u64 period, sample_period;
+ s64 period, sample_period;
s64 delta;
period = perf_calculate_period(event, nsec, count);
@@ -1505,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
hwc->sample_period = sample_period;
- if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+ if (local64_read(&hwc->period_left) > 8*sample_period) {
perf_disable();
perf_event_stop(event);
- atomic64_set(&hwc->period_left, 0);
+ local64_set(&hwc->period_left, 0);
perf_event_start(event);
perf_enable();
}
@@ -1549,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
perf_disable();
event->pmu->read(event);
- now = atomic64_read(&event->count);
+ now = local64_read(&event->count);
delta = now - hwc->freq_count_stamp;
hwc->freq_count_stamp = now;
@@ -1701,6 +1736,11 @@ static void __perf_event_read(void *info)
event->pmu->read(event);
}
+static inline u64 perf_event_count(struct perf_event *event)
+{
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+
static u64 perf_event_read(struct perf_event *event)
{
/*
@@ -1720,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event)
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
- return atomic64_read(&event->count);
+ return perf_event_count(event);
}
/*
@@ -1841,6 +1881,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void perf_pending_sync(struct perf_event *event);
+static void perf_buffer_put(struct perf_buffer *buffer);
static void free_event(struct perf_event *event)
{
@@ -1848,7 +1889,7 @@ static void free_event(struct perf_event *event)
if (!event->parent) {
atomic_dec(&nr_events);
- if (event->attr.mmap)
+ if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
@@ -1856,9 +1897,9 @@ static void free_event(struct perf_event *event)
atomic_dec(&nr_task_events);
}
- if (event->output) {
- fput(event->output->filp);
- event->output = NULL;
+ if (event->buffer) {
+ perf_buffer_put(event->buffer);
+ event->buffer = NULL;
}
if (event->destroy)
@@ -1893,8 +1934,8 @@ int perf_event_release_kernel(struct perf_event *event)
*/
mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
list_del_event(event, ctx);
- perf_destroy_group(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
mutex_unlock(&ctx->mutex);
@@ -2083,13 +2124,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned int events = POLL_HUP;
rcu_read_lock();
- data = rcu_dereference(event->data);
- if (data)
- events = atomic_xchg(&data->poll, 0);
+ buffer = rcu_dereference(event->buffer);
+ if (buffer)
+ events = atomic_xchg(&buffer->poll, 0);
rcu_read_unlock();
poll_wait(file, &event->waitq, wait);
@@ -2100,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
static void perf_event_reset(struct perf_event *event)
{
(void)perf_event_read(event);
- atomic64_set(&event->count, 0);
+ local64_set(&event->count, 0);
perf_event_update_userpage(event);
}
@@ -2175,7 +2216,27 @@ unlock:
return ret;
}
-static int perf_event_set_output(struct perf_event *event, int output_fd);
+static const struct file_operations perf_fops;
+
+static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+{
+ struct file *file;
+
+ file = fget_light(fd, fput_needed);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &perf_fops) {
+ fput_light(file, *fput_needed);
+ *fput_needed = 0;
+ return ERR_PTR(-EBADF);
+ }
+
+ return file->private_data;
+}
+
+static int perf_event_set_output(struct perf_event *event,
+ struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2202,7 +2263,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return perf_event_period(event, (u64 __user *)arg);
case PERF_EVENT_IOC_SET_OUTPUT:
- return perf_event_set_output(event, arg);
+ {
+ struct perf_event *output_event = NULL;
+ int fput_needed = 0;
+ int ret;
+
+ if (arg != -1) {
+ output_event = perf_fget_light(arg, &fput_needed);
+ if (IS_ERR(output_event))
+ return PTR_ERR(output_event);
+ }
+
+ ret = perf_event_set_output(event, output_event);
+ if (output_event)
+ fput_light(output_event->filp, fput_needed);
+
+ return ret;
+ }
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
@@ -2263,14 +2340,14 @@ static int perf_event_index(struct perf_event *event)
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
rcu_read_lock();
- data = rcu_dereference(event->data);
- if (!data)
+ buffer = rcu_dereference(event->buffer);
+ if (!buffer)
goto unlock;
- userpg = data->user_page;
+ userpg = buffer->user_page;
/*
* Disable preemption so as to not let the corresponding user-space
@@ -2280,9 +2357,9 @@ void perf_event_update_userpage(struct perf_event *event)
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = atomic64_read(&event->count);
+ userpg->offset = perf_event_count(event);
if (event->state == PERF_EVENT_STATE_ACTIVE)
- userpg->offset -= atomic64_read(&event->hw.prev_count);
+ userpg->offset -= local64_read(&event->hw.prev_count);
userpg->time_enabled = event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -2297,9 +2374,23 @@ unlock:
rcu_read_unlock();
}
-static unsigned long perf_data_size(struct perf_mmap_data *data)
+static unsigned long perf_data_size(struct perf_buffer *buffer);
+
+static void
+perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
{
- return data->nr_pages << (PAGE_SHIFT + data->data_order);
+ long max_size = perf_data_size(buffer);
+
+ if (watermark)
+ buffer->watermark = min(max_size, watermark);
+
+ if (!buffer->watermark)
+ buffer->watermark = max_size / 2;
+
+ if (flags & PERF_BUFFER_WRITABLE)
+ buffer->writable = 1;
+
+ atomic_set(&buffer->refcount, 1);
}
#ifndef CONFIG_PERF_USE_VMALLOC
@@ -2309,56 +2400,68 @@ static unsigned long perf_data_size(struct perf_mmap_data *data)
*/
static struct page *
-perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
{
- if (pgoff > data->nr_pages)
+ if (pgoff > buffer->nr_pages)
return NULL;
if (pgoff == 0)
- return virt_to_page(data->user_page);
+ return virt_to_page(buffer->user_page);
- return virt_to_page(data->data_pages[pgoff - 1]);
+ return virt_to_page(buffer->data_pages[pgoff - 1]);
}
-static struct perf_mmap_data *
-perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+static void *perf_mmap_alloc_page(int cpu)
{
- struct perf_mmap_data *data;
+ struct page *page;
+ int node;
+
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+static struct perf_buffer *
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct perf_buffer *buffer;
unsigned long size;
int i;
- WARN_ON(atomic_read(&event->mmap_count));
-
- size = sizeof(struct perf_mmap_data);
+ size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- data = kzalloc(size, GFP_KERNEL);
- if (!data)
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
goto fail;
- data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
- if (!data->user_page)
+ buffer->user_page = perf_mmap_alloc_page(cpu);
+ if (!buffer->user_page)
goto fail_user_page;
for (i = 0; i < nr_pages; i++) {
- data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
- if (!data->data_pages[i])
+ buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
+ if (!buffer->data_pages[i])
goto fail_data_pages;
}
- data->data_order = 0;
- data->nr_pages = nr_pages;
+ buffer->nr_pages = nr_pages;
- return data;
+ perf_buffer_init(buffer, watermark, flags);
+
+ return buffer;
fail_data_pages:
for (i--; i >= 0; i--)
- free_page((unsigned long)data->data_pages[i]);
+ free_page((unsigned long)buffer->data_pages[i]);
- free_page((unsigned long)data->user_page);
+ free_page((unsigned long)buffer->user_page);
fail_user_page:
- kfree(data);
+ kfree(buffer);
fail:
return NULL;
@@ -2372,14 +2475,19 @@ static void perf_mmap_free_page(unsigned long addr)
__free_page(page);
}
-static void perf_mmap_data_free(struct perf_mmap_data *data)
+static void perf_buffer_free(struct perf_buffer *buffer)
{
int i;
- perf_mmap_free_page((unsigned long)data->user_page);
- for (i = 0; i < data->nr_pages; i++)
- perf_mmap_free_page((unsigned long)data->data_pages[i]);
- kfree(data);
+ perf_mmap_free_page((unsigned long)buffer->user_page);
+ for (i = 0; i < buffer->nr_pages; i++)
+ perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
+ kfree(buffer);
+}
+
+static inline int page_order(struct perf_buffer *buffer)
+{
+ return 0;
}
#else
@@ -2390,13 +2498,18 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
* Required for architectures that have d-cache aliasing issues.
*/
+static inline int page_order(struct perf_buffer *buffer)
+{
+ return buffer->page_order;
+}
+
static struct page *
-perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
{
- if (pgoff > (1UL << data->data_order))
+ if (pgoff > (1UL << page_order(buffer)))
return NULL;
- return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
+ return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
}
static void perf_mmap_unmark_page(void *addr)
@@ -2406,59 +2519,59 @@ static void perf_mmap_unmark_page(void *addr)
page->mapping = NULL;
}
-static void perf_mmap_data_free_work(struct work_struct *work)
+static void perf_buffer_free_work(struct work_struct *work)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
void *base;
int i, nr;
- data = container_of(work, struct perf_mmap_data, work);
- nr = 1 << data->data_order;
+ buffer = container_of(work, struct perf_buffer, work);
+ nr = 1 << page_order(buffer);
- base = data->user_page;
+ base = buffer->user_page;
for (i = 0; i < nr + 1; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
- kfree(data);
+ kfree(buffer);
}
-static void perf_mmap_data_free(struct perf_mmap_data *data)
+static void perf_buffer_free(struct perf_buffer *buffer)
{
- schedule_work(&data->work);
+ schedule_work(&buffer->work);
}
-static struct perf_mmap_data *
-perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+static struct perf_buffer *
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned long size;
void *all_buf;
- WARN_ON(atomic_read(&event->mmap_count));
-
- size = sizeof(struct perf_mmap_data);
+ size = sizeof(struct perf_buffer);
size += sizeof(void *);
- data = kzalloc(size, GFP_KERNEL);
- if (!data)
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
goto fail;
- INIT_WORK(&data->work, perf_mmap_data_free_work);
+ INIT_WORK(&buffer->work, perf_buffer_free_work);
all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
if (!all_buf)
goto fail_all_buf;
- data->user_page = all_buf;
- data->data_pages[0] = all_buf + PAGE_SIZE;
- data->data_order = ilog2(nr_pages);
- data->nr_pages = 1;
+ buffer->user_page = all_buf;
+ buffer->data_pages[0] = all_buf + PAGE_SIZE;
+ buffer->page_order = ilog2(nr_pages);
+ buffer->nr_pages = 1;
+
+ perf_buffer_init(buffer, watermark, flags);
- return data;
+ return buffer;
fail_all_buf:
- kfree(data);
+ kfree(buffer);
fail:
return NULL;
@@ -2466,10 +2579,15 @@ fail:
#endif
+static unsigned long perf_data_size(struct perf_buffer *buffer)
+{
+ return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
+}
+
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct perf_event *event = vma->vm_file->private_data;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
int ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2479,14 +2597,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
rcu_read_lock();
- data = rcu_dereference(event->data);
- if (!data)
+ buffer = rcu_dereference(event->buffer);
+ if (!buffer)
goto unlock;
if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
goto unlock;
- vmf->page = perf_mmap_to_page(data, vmf->pgoff);
+ vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
if (!vmf->page)
goto unlock;
@@ -2501,41 +2619,35 @@ unlock:
return ret;
}
-static void
-perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
+static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
{
- long max_size = perf_data_size(data);
-
- atomic_set(&data->lock, -1);
-
- if (event->attr.watermark) {
- data->watermark = min_t(long, max_size,
- event->attr.wakeup_watermark);
- }
-
- if (!data->watermark)
- data->watermark = max_size / 2;
-
+ struct perf_buffer *buffer;
- rcu_assign_pointer(event->data, data);
+ buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
+ perf_buffer_free(buffer);
}
-static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
+static struct perf_buffer *perf_buffer_get(struct perf_event *event)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
- data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
- perf_mmap_data_free(data);
+ rcu_read_lock();
+ buffer = rcu_dereference(event->buffer);
+ if (buffer) {
+ if (!atomic_inc_not_zero(&buffer->refcount))
+ buffer = NULL;
+ }
+ rcu_read_unlock();
+
+ return buffer;
}
-static void perf_mmap_data_release(struct perf_event *event)
+static void perf_buffer_put(struct perf_buffer *buffer)
{
- struct perf_mmap_data *data = event->data;
-
- WARN_ON(atomic_read(&event->mmap_count));
+ if (!atomic_dec_and_test(&buffer->refcount))
+ return;
- rcu_assign_pointer(event->data, NULL);
- call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
+ call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
}
static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2549,15 +2661,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- WARN_ON_ONCE(event->ctx->parent_ctx);
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->data);
- struct user_struct *user = current_user();
+ unsigned long size = perf_data_size(event->buffer);
+ struct user_struct *user = event->mmap_user;
+ struct perf_buffer *buffer = event->buffer;
atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->locked_vm -= event->data->nr_locked;
- perf_mmap_data_release(event);
+ vma->vm_mm->locked_vm -= event->mmap_locked;
+ rcu_assign_pointer(event->buffer, NULL);
mutex_unlock(&event->mmap_mutex);
+
+ perf_buffer_put(buffer);
+ free_uid(user);
}
}
@@ -2574,11 +2689,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
unsigned long locked, lock_limit;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra, extra;
- int ret = 0;
+ int ret = 0, flags = 0;
+
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same buffer.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
@@ -2587,7 +2710,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
nr_pages = (vma_size / PAGE_SIZE) - 1;
/*
- * If we have data pages ensure they're a power-of-two number, so we
+ * If we have buffer pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2601,13 +2724,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->mmap_mutex);
- if (event->output) {
- ret = -EINVAL;
- goto unlock;
- }
-
- if (atomic_inc_not_zero(&event->mmap_count)) {
- if (nr_pages != event->data->nr_pages)
+ if (event->buffer) {
+ if (event->buffer->nr_pages == nr_pages)
+ atomic_inc(&event->buffer->refcount);
+ else
ret = -EINVAL;
goto unlock;
}
@@ -2636,24 +2756,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
goto unlock;
}
- WARN_ON(event->data);
+ WARN_ON(event->buffer);
- data = perf_mmap_data_alloc(event, nr_pages);
- ret = -ENOMEM;
- if (!data)
- goto unlock;
+ if (vma->vm_flags & VM_WRITE)
+ flags |= PERF_BUFFER_WRITABLE;
- ret = 0;
- perf_mmap_data_init(event, data);
+ buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
+ event->cpu, flags);
+ if (!buffer) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ rcu_assign_pointer(event->buffer, buffer);
- atomic_set(&event->mmap_count, 1);
atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->locked_vm += extra;
- event->data->nr_locked = extra;
- if (vma->vm_flags & VM_WRITE)
- event->data->writable = 1;
+ event->mmap_locked = extra;
+ event->mmap_user = get_current_user();
+ vma->vm_mm->locked_vm += event->mmap_locked;
unlock:
+ if (!ret)
+ atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
vma->vm_flags |= VM_RESERVED;
@@ -2823,11 +2946,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return NULL;
}
-__weak
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-}
-
/*
* We assume there is only KVM supporting the callbacks.
@@ -2853,15 +2971,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
/*
* Output
*/
-static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
unsigned long offset, unsigned long head)
{
unsigned long mask;
- if (!data->writable)
+ if (!buffer->writable)
return true;
- mask = perf_data_size(data) - 1;
+ mask = perf_data_size(buffer) - 1;
offset = (offset - tail) & mask;
head = (head - tail) & mask;
@@ -2874,7 +2992,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->data->poll, POLL_IN);
+ atomic_set(&handle->buffer->poll, POLL_IN);
if (handle->nmi) {
handle->event->pending_wakeup = 1;
@@ -2885,128 +3003,88 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
}
/*
- * Curious locking construct.
- *
* We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
* cannot fully serialize things.
*
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
* We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
*/
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
{
- struct perf_mmap_data *data = handle->data;
- int cur, cpu = get_cpu();
-
- handle->locked = 0;
+ struct perf_buffer *buffer = handle->buffer;
- for (;;) {
- cur = atomic_cmpxchg(&data->lock, -1, cpu);
- if (cur == -1) {
- handle->locked = 1;
- break;
- }
- if (cur == cpu)
- break;
-
- cpu_relax();
- }
+ preempt_disable();
+ local_inc(&buffer->nest);
+ handle->wakeup = local_read(&buffer->wakeup);
}
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
{
- struct perf_mmap_data *data = handle->data;
+ struct perf_buffer *buffer = handle->buffer;
unsigned long head;
- int cpu;
-
- data->done_head = data->head;
-
- if (!handle->locked)
- goto out;
again:
- /*
- * The xchg implies a full barrier that ensures all writes are done
- * before we publish the new head, matched by a rmb() in userspace when
- * reading this position.
- */
- while ((head = atomic_long_xchg(&data->done_head, 0)))
- data->user_page->data_head = head;
+ head = local_read(&buffer->head);
/*
- * NMI can happen here, which means we can miss a done_head update.
+ * IRQ/NMI can happen here, which means we can miss a head update.
*/
- cpu = atomic_xchg(&data->lock, -1);
- WARN_ON_ONCE(cpu != smp_processor_id());
+ if (!local_dec_and_test(&buffer->nest))
+ goto out;
/*
- * Therefore we have to validate we did not indeed do so.
+ * Publish the known good head. Rely on the full barrier implied
+ * by atomic_dec_and_test() order the buffer->head read and this
+ * write.
*/
- if (unlikely(atomic_long_read(&data->done_head))) {
- /*
- * Since we had it locked, we can lock it again.
- */
- while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
- cpu_relax();
+ buffer->user_page->data_head = head;
+ /*
+ * Now check if we missed an update, rely on the (compiler)
+ * barrier in atomic_dec_and_test() to re-read buffer->head.
+ */
+ if (unlikely(head != local_read(&buffer->head))) {
+ local_inc(&buffer->nest);
goto again;
}
- if (atomic_xchg(&data->wakeup, 0))
+ if (handle->wakeup != local_read(&buffer->wakeup))
perf_output_wakeup(handle);
-out:
- put_cpu();
+
+ out:
+ preempt_enable();
}
-void perf_output_copy(struct perf_output_handle *handle,
+__always_inline void perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
- unsigned int pages_mask;
- unsigned long offset;
- unsigned int size;
- void **pages;
-
- offset = handle->offset;
- pages_mask = handle->data->nr_pages - 1;
- pages = handle->data->data_pages;
-
do {
- unsigned long page_offset;
- unsigned long page_size;
- int nr;
+ unsigned long size = min_t(unsigned long, handle->size, len);
- nr = (offset >> PAGE_SHIFT) & pages_mask;
- page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
- page_offset = offset & (page_size - 1);
- size = min_t(unsigned int, page_size - page_offset, len);
+ memcpy(handle->addr, buf, size);
- memcpy(pages[nr] + page_offset, buf, size);
+ len -= size;
+ handle->addr += size;
+ buf += size;
+ handle->size -= size;
+ if (!handle->size) {
+ struct perf_buffer *buffer = handle->buffer;
- len -= size;
- buf += size;
- offset += size;
+ handle->page++;
+ handle->page &= buffer->nr_pages - 1;
+ handle->addr = buffer->data_pages[handle->page];
+ handle->size = PAGE_SIZE << page_order(buffer);
+ }
} while (len);
-
- handle->offset = offset;
-
- /*
- * Check we didn't copy past our reservation window, taking the
- * possible unsigned int wrap into account.
- */
- WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
int nmi, int sample)
{
- struct perf_event *output_event;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned long tail, offset, head;
int have_lost;
struct {
@@ -3022,27 +3100,23 @@ int perf_output_begin(struct perf_output_handle *handle,
if (event->parent)
event = event->parent;
- output_event = rcu_dereference(event->output);
- if (output_event)
- event = output_event;
-
- data = rcu_dereference(event->data);
- if (!data)
+ buffer = rcu_dereference(event->buffer);
+ if (!buffer)
goto out;
- handle->data = data;
+ handle->buffer = buffer;
handle->event = event;
handle->nmi = nmi;
handle->sample = sample;
- if (!data->nr_pages)
- goto fail;
+ if (!buffer->nr_pages)
+ goto out;
- have_lost = atomic_read(&data->lost);
+ have_lost = local_read(&buffer->lost);
if (have_lost)
size += sizeof(lost_event);
- perf_output_lock(handle);
+ perf_output_get_handle(handle);
do {
/*
@@ -3050,26 +3124,30 @@ int perf_output_begin(struct perf_output_handle *handle,
* tail pointer. So that all reads will be completed before the
* write is issued.
*/
- tail = ACCESS_ONCE(data->user_page->data_tail);
+ tail = ACCESS_ONCE(buffer->user_page->data_tail);
smp_rmb();
- offset = head = atomic_long_read(&data->head);
+ offset = head = local_read(&buffer->head);
head += size;
- if (unlikely(!perf_output_space(data, tail, offset, head)))
+ if (unlikely(!perf_output_space(buffer, tail, offset, head)))
goto fail;
- } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+ } while (local_cmpxchg(&buffer->head, offset, head) != offset);
- handle->offset = offset;
- handle->head = head;
+ if (head - local_read(&buffer->wakeup) > buffer->watermark)
+ local_add(buffer->watermark, &buffer->wakeup);
- if (head - tail > data->watermark)
- atomic_set(&data->wakeup, 1);
+ handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
+ handle->page &= buffer->nr_pages - 1;
+ handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
+ handle->addr = buffer->data_pages[handle->page];
+ handle->addr += handle->size;
+ handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.header.size = sizeof(lost_event);
lost_event.id = event->id;
- lost_event.lost = atomic_xchg(&data->lost, 0);
+ lost_event.lost = local_xchg(&buffer->lost, 0);
perf_output_put(handle, lost_event);
}
@@ -3077,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle,
return 0;
fail:
- atomic_inc(&data->lost);
- perf_output_unlock(handle);
+ local_inc(&buffer->lost);
+ perf_output_put_handle(handle);
out:
rcu_read_unlock();
@@ -3088,19 +3166,19 @@ out:
void perf_output_end(struct perf_output_handle *handle)
{
struct perf_event *event = handle->event;
- struct perf_mmap_data *data = handle->data;
+ struct perf_buffer *buffer = handle->buffer;
int wakeup_events = event->attr.wakeup_events;
if (handle->sample && wakeup_events) {
- int events = atomic_inc_return(&data->events);
+ int events = local_inc_return(&buffer->events);
if (events >= wakeup_events) {
- atomic_sub(wakeup_events, &data->events);
- atomic_set(&data->wakeup, 1);
+ local_sub(wakeup_events, &buffer->events);
+ local_inc(&buffer->wakeup);
}
}
- perf_output_unlock(handle);
+ perf_output_put_handle(handle);
rcu_read_unlock();
}
@@ -3133,7 +3211,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 values[4];
int n = 0;
- values[n++] = atomic64_read(&event->count);
+ values[n++] = perf_event_count(event);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -3170,7 +3248,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (leader != event)
leader->pmu->read(leader);
- values[n++] = atomic64_read(&leader->count);
+ values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
@@ -3182,7 +3260,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (sub != event)
sub->pmu->read(sub);
- values[n++] = atomic64_read(&sub->count);
+ values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
@@ -3413,7 +3491,7 @@ perf_event_read_event(struct perf_event *event,
/*
* task tracking -- fork/exit
*
- * enabled by: attr.comm | attr.mmap | attr.task
+ * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
*/
struct perf_task_event {
@@ -3436,22 +3514,13 @@ static void perf_event_task_output(struct perf_event *event,
{
struct perf_output_handle handle;
struct task_struct *task = task_event->task;
- unsigned long flags;
int size, ret;
- /*
- * If this CPU attempts to acquire an rq lock held by a CPU spinning
- * in perf_output_lock() from interrupt context, it's game over.
- */
- local_irq_save(flags);
-
size = task_event->event_id.header.size;
ret = perf_output_begin(&handle, event, size, 0, 0);
- if (ret) {
- local_irq_restore(flags);
+ if (ret)
return;
- }
task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3531,6 @@ static void perf_event_task_output(struct perf_event *event,
perf_output_put(&handle, task_event->event_id);
perf_output_end(&handle);
- local_irq_restore(flags);
}
static int perf_event_task_match(struct perf_event *event)
@@ -3473,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event)
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
- if (event->attr.comm || event->attr.mmap || event->attr.task)
+ if (event->attr.comm || event->attr.mmap ||
+ event->attr.mmap_data || event->attr.task)
return 1;
return 0;
@@ -3698,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event,
}
static int perf_event_mmap_match(struct perf_event *event,
- struct perf_mmap_event *mmap_event)
+ struct perf_mmap_event *mmap_event,
+ int executable)
{
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
@@ -3706,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event,
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
- if (event->attr.mmap)
+ if ((!executable && event->attr.mmap_data) ||
+ (executable && event->attr.mmap))
return 1;
return 0;
}
static void perf_event_mmap_ctx(struct perf_event_context *ctx,
- struct perf_mmap_event *mmap_event)
+ struct perf_mmap_event *mmap_event,
+ int executable)
{
struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_mmap_match(event, mmap_event))
+ if (perf_event_mmap_match(event, mmap_event, executable))
perf_event_mmap_output(event, mmap_event);
}
}
@@ -3762,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
if (!vma->vm_mm) {
name = strncpy(tmp, "[vdso]", sizeof(tmp));
goto got_name;
+ } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+ vma->vm_end >= vma->vm_mm->brk) {
+ name = strncpy(tmp, "[heap]", sizeof(tmp));
+ goto got_name;
+ } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack) {
+ name = strncpy(tmp, "[stack]", sizeof(tmp));
+ goto got_name;
}
name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3778,17 +3858,17 @@ got_name:
rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);
- perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
+ perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
ctx = rcu_dereference(current->perf_event_ctxp);
if (ctx)
- perf_event_mmap_ctx(ctx, mmap_event);
+ perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
put_cpu_var(perf_cpu_context);
rcu_read_unlock();
kfree(buf);
}
-void __perf_event_mmap(struct vm_area_struct *vma)
+void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
@@ -3950,14 +4030,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
hwc->last_period = hwc->sample_period;
again:
- old = val = atomic64_read(&hwc->period_left);
+ old = val = local64_read(&hwc->period_left);
if (val < 0)
return 0;
nr = div64_u64(period + val, period);
offset = nr * period;
val -= offset;
- if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+ if (local64_cmpxchg(&hwc->period_left, old, val) != old)
goto again;
return nr;
@@ -3990,20 +4070,13 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
}
}
-static void perf_swevent_unthrottle(struct perf_event *event)
-{
- /*
- * Nothing to do, we already reset hwc->interrupts.
- */
-}
-
static void perf_swevent_add(struct perf_event *event, u64 nr,
int nmi, struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
- atomic64_add(nr, &event->count);
+ local64_add(nr, &event->count);
if (!regs)
return;
@@ -4014,15 +4087,12 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
return perf_swevent_overflow(event, 1, nmi, data, regs);
- if (atomic64_add_negative(nr, &hwc->period_left))
+ if (local64_add_negative(nr, &hwc->period_left))
return;
perf_swevent_overflow(event, 0, nmi, data, regs);
}
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data);
-
static int perf_exclude_event(struct perf_event *event,
struct pt_regs *regs)
{
@@ -4052,10 +4122,6 @@ static int perf_swevent_match(struct perf_event *event,
if (perf_exclude_event(event, regs))
return 0;
- if (event->attr.type == PERF_TYPE_TRACEPOINT &&
- !perf_tp_event_match(event, data))
- return 0;
-
return 1;
}
@@ -4066,19 +4132,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
return hash_64(val, SWEVENT_HLIST_BITS);
}
-static struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
- u64 hash;
- struct swevent_hlist *hlist;
+ u64 hash = swevent_hash(type, event_id);
- hash = swevent_hash(type, event_id);
+ return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+ struct swevent_hlist *hlist;
hlist = rcu_dereference(ctx->swevent_hlist);
if (!hlist)
return NULL;
- return &hlist->heads[hash];
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+{
+ struct swevent_hlist *hlist;
+ u32 event_id = event->attr.config;
+ u64 type = event->attr.type;
+
+ /*
+ * Event scheduling is always serialized against hlist allocation
+ * and release. Which makes the protected version suitable here.
+ * The context lock guarantees that.
+ */
+ hlist = rcu_dereference_protected(ctx->swevent_hlist,
+ lockdep_is_held(&event->ctx->lock));
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
}
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
@@ -4095,7 +4188,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
rcu_read_lock();
- head = find_swevent_head(cpuctx, type, event_id);
+ head = find_swevent_head_rcu(cpuctx, type, event_id);
if (!head)
goto end;
@@ -4110,7 +4203,7 @@ end:
int perf_swevent_get_recursion_context(void)
{
- struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
int rctx;
if (in_nmi())
@@ -4122,10 +4215,8 @@ int perf_swevent_get_recursion_context(void)
else
rctx = 0;
- if (cpuctx->recursion[rctx]) {
- put_cpu_var(perf_cpu_context);
+ if (cpuctx->recursion[rctx])
return -1;
- }
cpuctx->recursion[rctx]++;
barrier();
@@ -4134,15 +4225,12 @@ int perf_swevent_get_recursion_context(void)
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-void perf_swevent_put_recursion_context(int rctx)
+void inline perf_swevent_put_recursion_context(int rctx)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
barrier();
cpuctx->recursion[rctx]--;
- put_cpu_var(perf_cpu_context);
}
-EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
-
void __perf_sw_event(u32 event_id, u64 nr, int nmi,
struct pt_regs *regs, u64 addr)
@@ -4150,6 +4238,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
struct perf_sample_data data;
int rctx;
+ preempt_disable_notrace();
rctx = perf_swevent_get_recursion_context();
if (rctx < 0)
return;
@@ -4159,6 +4248,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
perf_swevent_put_recursion_context(rctx);
+ preempt_enable_notrace();
}
static void perf_swevent_read(struct perf_event *event)
@@ -4178,7 +4268,7 @@ static int perf_swevent_enable(struct perf_event *event)
perf_swevent_set_period(event);
}
- head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+ head = find_swevent_head(cpuctx, event);
if (WARN_ON_ONCE(!head))
return -EINVAL;
@@ -4192,11 +4282,22 @@ static void perf_swevent_disable(struct perf_event *event)
hlist_del_rcu(&event->hlist_entry);
}
+static void perf_swevent_void(struct perf_event *event)
+{
+}
+
+static int perf_swevent_int(struct perf_event *event)
+{
+ return 0;
+}
+
static const struct pmu perf_ops_generic = {
.enable = perf_swevent_enable,
.disable = perf_swevent_disable,
+ .start = perf_swevent_int,
+ .stop = perf_swevent_void,
.read = perf_swevent_read,
- .unthrottle = perf_swevent_unthrottle,
+ .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
};
/*
@@ -4277,8 +4378,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
u64 now;
now = cpu_clock(cpu);
- prev = atomic64_xchg(&event->hw.prev_count, now);
- atomic64_add(now - prev, &event->count);
+ prev = local64_xchg(&event->hw.prev_count, now);
+ local64_add(now - prev, &event->count);
}
static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4286,7 +4387,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
int cpu = raw_smp_processor_id();
- atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+ local64_set(&hwc->prev_count, cpu_clock(cpu));
perf_swevent_start_hrtimer(event);
return 0;
@@ -4318,9 +4419,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
u64 prev;
s64 delta;
- prev = atomic64_xchg(&event->hw.prev_count, now);
+ prev = local64_xchg(&event->hw.prev_count, now);
delta = now - prev;
- atomic64_add(delta, &event->count);
+ local64_add(delta, &event->count);
}
static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4330,7 +4431,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
now = event->ctx->time;
- atomic64_set(&hwc->prev_count, now);
+ local64_set(&hwc->prev_count, now);
perf_swevent_start_hrtimer(event);
@@ -4366,6 +4467,14 @@ static const struct pmu perf_ops_task_clock = {
.read = task_clock_perf_event_read,
};
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+{
+ return rcu_dereference_protected(cpuctx->swevent_hlist,
+ lockdep_is_held(&cpuctx->hlist_mutex));
+}
+
static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
{
struct swevent_hlist *hlist;
@@ -4376,12 +4485,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
{
- struct swevent_hlist *hlist;
+ struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
- if (!cpuctx->swevent_hlist)
+ if (!hlist)
return;
- hlist = cpuctx->swevent_hlist;
rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
}
@@ -4418,7 +4526,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
mutex_lock(&cpuctx->hlist_mutex);
- if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+ if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4467,10 +4575,48 @@ static int swevent_hlist_get(struct perf_event *event)
#ifdef CONFIG_EVENT_TRACING
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
- int entry_size, struct pt_regs *regs)
+static const struct pmu perf_ops_tracepoint = {
+ .enable = perf_trace_enable,
+ .disable = perf_trace_disable,
+ .start = perf_swevent_int,
+ .stop = perf_swevent_void,
+ .read = perf_swevent_read,
+ .unthrottle = perf_swevent_void,
+};
+
+static int perf_tp_filter_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ void *record = data->raw->data;
+
+ if (likely(!event->filter) || filter_match_preds(event->filter, record))
+ return 1;
+ return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /*
+ * All tracepoints are from kernel-space.
+ */
+ if (event->attr.exclude_kernel)
+ return 0;
+
+ if (!perf_tp_filter_match(event, data))
+ return 0;
+
+ return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+ struct pt_regs *regs, struct hlist_head *head, int rctx)
{
struct perf_sample_data data;
+ struct perf_event *event;
+ struct hlist_node *node;
+
struct perf_raw_record raw = {
.size = entry_size,
.data = record,
@@ -4479,26 +4625,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
perf_sample_data_init(&data, addr);
data.raw = &raw;
- /* Trace events already protected against recursion */
- do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
- &data, regs);
-}
-EXPORT_SYMBOL_GPL(perf_tp_event);
-
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data)
-{
- void *record = data->raw->data;
+ hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_add(event, count, 1, &data, regs);
+ }
- if (likely(!event->filter) || filter_match_preds(event->filter, record))
- return 1;
- return 0;
+ perf_swevent_put_recursion_context(rctx);
}
+EXPORT_SYMBOL_GPL(perf_tp_event);
static void tp_perf_event_destroy(struct perf_event *event)
{
- perf_trace_disable(event->attr.config);
- swevent_hlist_put(event);
+ perf_trace_destroy(event);
}
static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4514,17 +4652,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (perf_trace_enable(event->attr.config))
+ err = perf_trace_init(event);
+ if (err)
return NULL;
event->destroy = tp_perf_event_destroy;
- err = swevent_hlist_get(event);
- if (err) {
- perf_trace_disable(event->attr.config);
- return ERR_PTR(err);
- }
- return &perf_ops_generic;
+ return &perf_ops_tracepoint;
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4552,12 +4686,6 @@ static void perf_event_free_filter(struct perf_event *event)
#else
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data)
-{
- return 1;
-}
-
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
return NULL;
@@ -4746,7 +4874,7 @@ perf_event_alloc(struct perf_event_attr *attr,
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
+ local64_set(&hwc->period_left, hwc->sample_period);
/*
* we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4795,7 +4923,7 @@ done:
if (!event->parent) {
atomic_inc(&nr_events);
- if (event->attr.mmap)
+ if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
@@ -4886,54 +5014,53 @@ err_size:
goto out;
}
-static int perf_event_set_output(struct perf_event *event, int output_fd)
+static int
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct perf_event *output_event = NULL;
- struct file *output_file = NULL;
- struct perf_event *old_output;
- int fput_needed = 0;
+ struct perf_buffer *buffer = NULL, *old_buffer = NULL;
int ret = -EINVAL;
- if (!output_fd)
+ if (!output_event)
goto set;
- output_file = fget_light(output_fd, &fput_needed);
- if (!output_file)
- return -EBADF;
-
- if (output_file->f_op != &perf_fops)
+ /* don't allow circular references */
+ if (event == output_event)
goto out;
- output_event = output_file->private_data;
-
- /* Don't chain output fds */
- if (output_event->output)
+ /*
+ * Don't allow cross-cpu buffers
+ */
+ if (output_event->cpu != event->cpu)
goto out;
- /* Don't set an output fd when we already have an output channel */
- if (event->data)
+ /*
+ * If its not a per-cpu buffer, it must be the same task.
+ */
+ if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
- atomic_long_inc(&output_file->f_count);
-
set:
mutex_lock(&event->mmap_mutex);
- old_output = event->output;
- rcu_assign_pointer(event->output, output_event);
- mutex_unlock(&event->mmap_mutex);
+ /* Can't redirect output if we've got an active mmap() */
+ if (atomic_read(&event->mmap_count))
+ goto unlock;
- if (old_output) {
- /*
- * we need to make sure no existing perf_output_*()
- * is still referencing this event.
- */
- synchronize_rcu();
- fput(old_output->filp);
+ if (output_event) {
+ /* get the buffer we want to redirect to */
+ buffer = perf_buffer_get(output_event);
+ if (!buffer)
+ goto unlock;
}
+ old_buffer = event->buffer;
+ rcu_assign_pointer(event->buffer, buffer);
ret = 0;
+unlock:
+ mutex_unlock(&event->mmap_mutex);
+
+ if (old_buffer)
+ perf_buffer_put(old_buffer);
out:
- fput_light(output_file, fput_needed);
return ret;
}
@@ -4949,13 +5076,13 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
- struct perf_event *event, *group_leader;
+ struct perf_event *event, *group_leader = NULL, *output_event = NULL;
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
struct file *group_file = NULL;
+ int event_fd;
int fput_needed = 0;
- int fput_needed2 = 0;
int err;
/* for future expandability... */
@@ -4976,26 +5103,38 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ event_fd = get_unused_fd_flags(O_RDWR);
+ if (event_fd < 0)
+ return event_fd;
+
/*
* Get the target context (task or percpu):
*/
ctx = find_get_context(pid, cpu);
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto err_fd;
+ }
+
+ if (group_fd != -1) {
+ group_leader = perf_fget_light(group_fd, &fput_needed);
+ if (IS_ERR(group_leader)) {
+ err = PTR_ERR(group_leader);
+ goto err_put_context;
+ }
+ group_file = group_leader->filp;
+ if (flags & PERF_FLAG_FD_OUTPUT)
+ output_event = group_leader;
+ if (flags & PERF_FLAG_FD_NO_GROUP)
+ group_leader = NULL;
+ }
/*
* Look up the group leader (we will attach this event to it):
*/
- group_leader = NULL;
- if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+ if (group_leader) {
err = -EINVAL;
- group_file = fget_light(group_fd, &fput_needed);
- if (!group_file)
- goto err_put_context;
- if (group_file->f_op != &perf_fops)
- goto err_put_context;
- group_leader = group_file->private_data;
/*
* Do not allow a recursive hierarchy (this new sibling
* becoming part of another group-sibling):
@@ -5017,22 +5156,21 @@ SYSCALL_DEFINE5(perf_event_open,
event = perf_event_alloc(&attr, cpu, ctx, group_leader,
NULL, NULL, GFP_KERNEL);
- err = PTR_ERR(event);
- if (IS_ERR(event))
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
goto err_put_context;
+ }
- err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
- if (err < 0)
- goto err_free_put_context;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_free_put_context;
+ }
- event_file = fget_light(err, &fput_needed2);
- if (!event_file)
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
goto err_free_put_context;
-
- if (flags & PERF_FLAG_FD_OUTPUT) {
- err = perf_event_set_output(event, group_fd);
- if (err)
- goto err_fput_free_put_context;
}
event->filp = event_file;
@@ -5048,19 +5186,23 @@ SYSCALL_DEFINE5(perf_event_open,
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
-err_fput_free_put_context:
- fput_light(event_file, fput_needed2);
+ /*
+ * Drop the reference on the group_event after placing the
+ * new event on the sibling_list. This ensures destruction
+ * of the group leader will find the pointer to itself in
+ * perf_group_detach().
+ */
+ fput_light(group_file, fput_needed);
+ fd_install(event_fd, event_file);
+ return event_fd;
err_free_put_context:
- if (err < 0)
- free_event(event);
-
+ free_event(event);
err_put_context:
- if (err < 0)
- put_ctx(ctx);
-
fput_light(group_file, fput_needed);
-
+ put_ctx(ctx);
+err_fd:
+ put_unused_fd(event_fd);
return err;
}
@@ -5166,7 +5308,7 @@ inherit_event(struct perf_event *parent_event,
hwc->sample_period = sample_period;
hwc->last_period = sample_period;
- atomic64_set(&hwc->period_left, sample_period);
+ local64_set(&hwc->period_left, sample_period);
}
child_event->overflow_handler = parent_event->overflow_handler;
@@ -5227,12 +5369,12 @@ static void sync_child_event(struct perf_event *child_event,
if (child_event->attr.inherit_stat)
perf_event_read_event(child_event, child);
- child_val = atomic64_read(&child_event->count);
+ child_val = perf_event_count(child_event);
/*
* Add back the child's count to the parent's count:
*/
- atomic64_add(child_val, &parent_event->count);
+ atomic64_add(child_val, &parent_event->child_count);
atomic64_add(child_event->total_time_enabled,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
@@ -5371,6 +5513,7 @@ static void perf_free_event(struct perf_event *event,
fput(parent->filp);
+ perf_group_detach(event);
list_del_event(event, ctx);
free_event(event);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index aebb30d9c233..e9fd8c132d26 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -513,6 +513,13 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
+ /* bump default and minimum pid_max based on number of cpus */
+ pid_max = min(pid_max_max, max_t(int, pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ pid_max_min = max_t(int, pid_max_min,
+ PIDS_PER_CPU_MIN * num_possible_cpus());
+ pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+
init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..996a4dec5f96 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
-struct pm_qos_request_list {
- struct list_head list;
- union {
- s32 value;
- s32 usec;
- s32 kbps;
- };
- int pm_qos_class;
+enum pm_qos_type {
+ PM_QOS_MAX, /* return the largest value */
+ PM_QOS_MIN /* return the smallest value */
};
-static s32 max_compare(s32 v1, s32 v2);
-static s32 min_compare(s32 v1, s32 v2);
-
struct pm_qos_object {
- struct pm_qos_request_list requests;
+ struct plist_head requests;
struct blocking_notifier_head *notifiers;
struct miscdevice pm_qos_power_miscdev;
char *name;
s32 default_value;
- atomic_t target_value;
- s32 (*comparitor)(s32, s32);
+ enum pm_qos_type type;
};
+static DEFINE_SPINLOCK(pm_qos_lock);
+
static struct pm_qos_object null_pm_qos;
static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
static struct pm_qos_object cpu_dma_pm_qos = {
- .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
- .comparitor = min_compare
+ .type = PM_QOS_MIN,
};
static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
static struct pm_qos_object network_lat_pm_qos = {
- .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
.notifiers = &network_lat_notifier,
.name = "network_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
- .comparitor = min_compare
+ .type = PM_QOS_MIN
};
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_object network_throughput_pm_qos = {
- .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
.default_value = 0,
- .target_value = ATOMIC_INIT(0),
- .comparitor = max_compare
+ .type = PM_QOS_MAX,
};
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
&network_throughput_pm_qos
};
-static DEFINE_SPINLOCK(pm_qos_lock);
-
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos);
static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
.release = pm_qos_power_release,
};
-/* static helper functions */
-static s32 max_compare(s32 v1, s32 v2)
+/* unlocked internal variant */
+static inline int pm_qos_get_value(struct pm_qos_object *o)
{
- return max(v1, v2);
-}
+ if (plist_head_empty(&o->requests))
+ return o->default_value;
-static s32 min_compare(s32 v1, s32 v2)
-{
- return min(v1, v2);
-}
+ switch (o->type) {
+ case PM_QOS_MIN:
+ return plist_last(&o->requests)->prio;
+ case PM_QOS_MAX:
+ return plist_first(&o->requests)->prio;
-static void update_target(int pm_qos_class)
+ default:
+ /* runtime check for not using enum */
+ BUG();
+ }
+}
+
+static void update_target(struct pm_qos_object *o, struct plist_node *node,
+ int del, int value)
{
- s32 extreme_value;
- struct pm_qos_request_list *node;
unsigned long flags;
- int call_notifier = 0;
+ int prev_value, curr_value;
spin_lock_irqsave(&pm_qos_lock, flags);
- extreme_value = pm_qos_array[pm_qos_class]->default_value;
- list_for_each_entry(node,
- &pm_qos_array[pm_qos_class]->requests.list, list) {
- extreme_value = pm_qos_array[pm_qos_class]->comparitor(
- extreme_value, node->value);
- }
- if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
- extreme_value) {
- call_notifier = 1;
- atomic_set(&pm_qos_array[pm_qos_class]->target_value,
- extreme_value);
- pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
- atomic_read(&pm_qos_array[pm_qos_class]->target_value));
+ prev_value = pm_qos_get_value(o);
+ /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
+ if (value != PM_QOS_DEFAULT_VALUE) {
+ /*
+ * to change the list, we atomically remove, reinit
+ * with new value and add, then see if the extremal
+ * changed
+ */
+ plist_del(node, &o->requests);
+ plist_node_init(node, value);
+ plist_add(node, &o->requests);
+ } else if (del) {
+ plist_del(node, &o->requests);
+ } else {
+ plist_add(node, &o->requests);
}
+ curr_value = pm_qos_get_value(o);
spin_unlock_irqrestore(&pm_qos_lock, flags);
- if (call_notifier)
- blocking_notifier_call_chain(
- pm_qos_array[pm_qos_class]->notifiers,
- (unsigned long) extreme_value, NULL);
+ if (prev_value != curr_value)
+ blocking_notifier_call_chain(o->notifiers,
+ (unsigned long)curr_value,
+ NULL);
}
static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor)
*/
int pm_qos_request(int pm_qos_class)
{
- return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
+ unsigned long flags;
+ int value;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return value;
}
EXPORT_SYMBOL_GPL(pm_qos_request);
+int pm_qos_request_active(struct pm_qos_request_list *req)
+{
+ return req->pm_qos_class != 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_request_active);
+
/**
* pm_qos_add_request - inserts new qos request into the list
* @pm_qos_class: identifies which list of qos request to us
@@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
* element as a handle for use in updating and removal. Call needs to save
* this handle for later use.
*/
-struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
+void pm_qos_add_request(struct pm_qos_request_list *dep,
+ int pm_qos_class, s32 value)
{
- struct pm_qos_request_list *dep;
- unsigned long flags;
+ struct pm_qos_object *o = pm_qos_array[pm_qos_class];
+ int new_value;
- dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
- if (dep) {
- if (value == PM_QOS_DEFAULT_VALUE)
- dep->value = pm_qos_array[pm_qos_class]->default_value;
- else
- dep->value = value;
- dep->pm_qos_class = pm_qos_class;
-
- spin_lock_irqsave(&pm_qos_lock, flags);
- list_add(&dep->list,
- &pm_qos_array[pm_qos_class]->requests.list);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- update_target(pm_qos_class);
+ if (pm_qos_request_active(dep)) {
+ WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
+ return;
}
-
- return dep;
+ if (value == PM_QOS_DEFAULT_VALUE)
+ new_value = o->default_value;
+ else
+ new_value = value;
+ plist_node_init(&dep->list, new_value);
+ dep->pm_qos_class = pm_qos_class;
+ update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
@@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
* Attempts are made to make this code callable on hot code paths.
*/
void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
- s32 new_value)
+ s32 new_value)
{
- unsigned long flags;
- int pending_update = 0;
s32 temp;
+ struct pm_qos_object *o;
+
+ if (!pm_qos_req) /*guard against callers passing in null */
+ return;
- if (pm_qos_req) { /*guard against callers passing in null */
- spin_lock_irqsave(&pm_qos_lock, flags);
- if (new_value == PM_QOS_DEFAULT_VALUE)
- temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
- else
- temp = new_value;
-
- if (temp != pm_qos_req->value) {
- pending_update = 1;
- pm_qos_req->value = temp;
- }
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- if (pending_update)
- update_target(pm_qos_req->pm_qos_class);
+ if (!pm_qos_request_active(pm_qos_req)) {
+ WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+ return;
}
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+
+ if (new_value == PM_QOS_DEFAULT_VALUE)
+ temp = o->default_value;
+ else
+ temp = new_value;
+
+ if (temp != pm_qos_req->list.prio)
+ update_target(o, &pm_qos_req->list, 0, temp);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
@@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
*/
void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
{
- unsigned long flags;
- int qos_class;
+ struct pm_qos_object *o;
if (pm_qos_req == NULL)
return;
/* silent return to keep pcm code cleaner */
- qos_class = pm_qos_req->pm_qos_class;
- spin_lock_irqsave(&pm_qos_lock, flags);
- list_del(&pm_qos_req->list);
- kfree(pm_qos_req);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- update_target(qos_class);
+ if (!pm_qos_request_active(pm_qos_req)) {
+ WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ return;
+ }
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+ update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
+ memset(pm_qos_req, 0, sizeof(*pm_qos_req));
}
EXPORT_SYMBOL_GPL(pm_qos_remove_request);
@@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
if (pm_qos_class >= 0) {
- filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
- PM_QOS_DEFAULT_VALUE);
+ struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req));
+ if (!req)
+ return -ENOMEM;
+
+ pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
+ filp->private_data = req;
if (filp->private_data)
return 0;
@@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
{
struct pm_qos_request_list *req;
- req = (struct pm_qos_request_list *)filp->private_data;
+ req = filp->private_data;
pm_qos_remove_request(req);
+ kfree(req);
return 0;
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 00bb252f29a2..f66bdd33a6c6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
- struct sighand_struct *sighand;
- struct signal_struct *sig;
+ struct signal_struct *sig = tsk->signal;
struct task_struct *t;
- *times = INIT_CPUTIME;
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
rcu_read_lock();
- sighand = rcu_dereference(tsk->sighand);
- if (!sighand)
+ /* make sure we can trust tsk->thread_group list */
+ if (!likely(pid_alive(tsk)))
goto out;
- sig = tsk->signal;
-
t = tsk;
do {
times->utime = cputime_add(times->utime, t->utime);
times->stime = cputime_add(times->stime, t->stime);
times->sum_exec_runtime += t->se.sum_exec_runtime;
-
- t = next_thread(t);
- } while (t != tsk);
-
- times->utime = cputime_add(times->utime, sig->utime);
- times->stime = cputime_add(times->stime, sig->stime);
- times->sum_exec_runtime += sig->sum_sched_runtime;
+ } while_each_thread(tsk, t);
out:
rcu_read_unlock();
}
@@ -363,7 +356,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
}
} else {
read_lock(&tasklist_lock);
- if (thread_group_leader(p) && p->signal) {
+ if (thread_group_leader(p) && p->sighand) {
error =
cpu_clock_sample_group(which_clock,
p, &rtn);
@@ -439,7 +432,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
if (likely(p != NULL)) {
read_lock(&tasklist_lock);
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
/*
* We raced with the reaping of the task.
* The deletion should have cleared us off the list.
@@ -691,10 +684,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
read_lock(&tasklist_lock);
/*
* We need the tasklist_lock to protect against reaping that
- * clears p->signal. If p has just been reaped, we can no
+ * clears p->sighand. If p has just been reaped, we can no
* longer get any information about it at all.
*/
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
read_unlock(&tasklist_lock);
put_task_struct(p);
timer->it.cpu.task = NULL;
@@ -863,7 +856,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
clear_dead = p->exit_state;
} else {
read_lock(&tasklist_lock);
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
/*
* The process has been reaped.
* We can't even collect a sample any more.
@@ -1199,7 +1192,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
spin_lock(&p->sighand->siglock);
} else {
read_lock(&tasklist_lock);
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
/*
* The process has been reaped.
* We can't even collect a sample any more.
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
- /* tsk == current, ensure it is safe to use ->signal/sighand */
- if (unlikely(tsk->exit_state))
- return 0;
-
if (!task_cputime_zero(&tsk->cputime_expires)) {
struct task_cputime task_sample = {
.utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (sig->cputimer.running) {
struct task_cputime group_sample;
- thread_group_cputimer(tsk, &group_sample);
+ spin_lock(&sig->cputimer.lock);
+ group_sample = sig->cputimer.cputime;
+ spin_unlock(&sig->cputimer.lock);
+
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
}
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
{
LIST_HEAD(firing);
struct k_itimer *timer, *next;
+ unsigned long flags;
BUG_ON(!irqs_disabled());
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
if (!fastpath_timer_check(tsk))
return;
- spin_lock(&tsk->sighand->siglock);
+ if (!lock_task_sighand(tsk, &flags))
+ return;
/*
* Here we take off tsk->signal->cpu_timers[N] and
* tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* that gets the timer lock before we do will give it up and
* spin until we've taken care of that timer below.
*/
- spin_unlock(&tsk->sighand->siglock);
+ unlock_task_sighand(tsk, &flags);
/*
* Now that all the timers on our list have the firing flag,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 00d1fda58ab6..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -559,19 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->it_overrun = -1;
- error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
- if (error)
- goto out;
- /*
- * return the timer_id now. The next step is hard to
- * back out if there is an error.
- */
- if (copy_to_user(created_timer_id,
- &new_timer_id, sizeof (new_timer_id))) {
- error = -EFAULT;
- goto out;
- }
if (timer_event_spec) {
if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
error = -EFAULT;
@@ -597,6 +585,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER;
+ if (copy_to_user(created_timer_id,
+ &new_timer_id, sizeof (new_timer_id))) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+ if (error)
+ goto out;
+
spin_lock_irq(&current->sighand->siglock);
new_timer->it_signal = current->signal;
list_add(&new_timer->list, &current->signal->posix_timers);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5c36ea9d55d2..ca6066a6952e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG
depends on PM_ADVANCED_DEBUG
default n
+config SUSPEND_NVS
+ bool
+
config SUSPEND
bool "Suspend to RAM and standby"
depends on PM && ARCH_SUSPEND_POSSIBLE
+ select SUSPEND_NVS if HAS_IOMEM
default y
---help---
Allow the system to enter sleep states in which main memory is
@@ -130,13 +134,10 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
-config HIBERNATION_NVS
- bool
-
config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
- select HIBERNATION_NVS if HAS_IOMEM
+ select SUSPEND_NVS if HAS_IOMEM
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 524e058dcf06..f9063c6b185d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
block_io.o
-obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
+obj-$(CONFIG_SUSPEND_NVS) += nvs.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..c77963938bca 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
- * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
*
* This file is released under the GPLv2.
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
goto Enable_irqs;
}
- if (hibernation_test(TEST_CORE))
+ if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
goto Power_up;
in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
error);
/* Restore control flow magically appears here */
restore_processor_state();
- if (!in_suspend)
+ if (!in_suspend) {
+ events_check_enabled = false;
platform_leave(platform_mode);
+ }
Power_up:
sysdev_resume();
@@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
error = platform_begin(platform_mode);
if (error)
- return error;
+ goto Close;
/* Preallocate image memory before shutting down devices. */
error = hibernate_preallocate_memory();
@@ -336,6 +338,7 @@ int hibernation_snapshot(int platform_mode)
goto Close;
suspend_console();
+ hibernation_freeze_swap();
saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
@@ -511,18 +514,24 @@ int hibernation_platform_enter(void)
local_irq_disable();
sysdev_suspend(PMSG_HIBERNATE);
+ if (!pm_check_wakeup_events()) {
+ error = -EAGAIN;
+ goto Power_up;
+ }
+
hibernation_ops->enter();
/* We should never get here */
while (1);
- /*
- * We don't need to reenable the nonboot CPUs or resume consoles, since
- * the system is going to be halted anyway.
- */
+ Power_up:
+ sysdev_resume();
+ local_irq_enable();
+ enable_nonboot_cpus();
+
Platform_finish:
hibernation_ops->finish();
- dpm_suspend_noirq(PMSG_RESTORE);
+ dpm_resume_noirq(PMSG_RESTORE);
Resume_devices:
entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(state);
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The 'wakeup_count' attribute, along with the functions defined in
+ * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
+ * handled in a non-racy way.
+ *
+ * If a wakeup event occurs when the system is in a sleep state, it simply is
+ * woken up. In turn, if an event that would wake the system up from a sleep
+ * state occurs when it is undergoing a transition to that sleep state, the
+ * transition should be aborted. Moreover, if such an event occurs when the
+ * system is in the working state, an attempt to start a transition to the
+ * given sleep state should fail during certain period after the detection of
+ * the event. Using the 'state' attribute alone is not sufficient to satisfy
+ * these requirements, because a wakeup event may occur exactly when 'state'
+ * is being written to and may be delivered to user space right before it is
+ * frozen, so the event will remain only partially processed until the system is
+ * woken up by another event. In particular, it won't cause the transition to
+ * a sleep state to be aborted.
+ *
+ * This difficulty may be overcome if user space uses 'wakeup_count' before
+ * writing to 'state'. It first should read from 'wakeup_count' and store
+ * the read value. Then, after carrying out its own preparations for the system
+ * transition to a sleep state, it should write the stored value to
+ * 'wakeup_count'. If that fails, at least one wakeup event has occured since
+ * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
+ * is allowed to write to 'state', but the transition will be aborted if there
+ * are any wakeup events detected after 'wakeup_count' was written to.
+ */
+
+static ssize_t wakeup_count_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ unsigned long val;
+
+ return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
+}
+
+static ssize_t wakeup_count_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (sscanf(buf, "%lu", &val) == 1) {
+ if (pm_save_wakeup_count(val))
+ return n;
+ }
+ return -EINVAL;
+}
+
+power_attr(wakeup_count);
+#endif /* CONFIG_PM_SLEEP */
+
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
#endif
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
+ &wakeup_count_attr.attr,
#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c
index fdcad9ed5a7b..1836db60bbb6 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/nvs.c
@@ -15,7 +15,7 @@
/*
* Platforms, like ACPI, may want us to save some memory used by them during
- * hibernation and to restore the contents of this memory during the subsequent
+ * suspend and to restore the contents of this memory during the subsequent
* resume. The code below implements a mechanism allowing us to do that.
*/
@@ -30,7 +30,7 @@ struct nvs_page {
static LIST_HEAD(nvs_list);
/**
- * hibernate_nvs_register - register platform NVS memory region to save
+ * suspend_nvs_register - register platform NVS memory region to save
* @start - physical address of the region
* @size - size of the region
*
@@ -38,7 +38,7 @@ static LIST_HEAD(nvs_list);
* things so that the data from page-aligned addresses in this region will
* be copied into separate RAM pages.
*/
-int hibernate_nvs_register(unsigned long start, unsigned long size)
+int suspend_nvs_register(unsigned long start, unsigned long size)
{
struct nvs_page *entry, *next;
@@ -68,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size)
}
/**
- * hibernate_nvs_free - free data pages allocated for saving NVS regions
+ * suspend_nvs_free - free data pages allocated for saving NVS regions
*/
-void hibernate_nvs_free(void)
+void suspend_nvs_free(void)
{
struct nvs_page *entry;
@@ -86,16 +86,16 @@ void hibernate_nvs_free(void)
}
/**
- * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
+ * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
*/
-int hibernate_nvs_alloc(void)
+int suspend_nvs_alloc(void)
{
struct nvs_page *entry;
list_for_each_entry(entry, &nvs_list, node) {
entry->data = (void *)__get_free_page(GFP_KERNEL);
if (!entry->data) {
- hibernate_nvs_free();
+ suspend_nvs_free();
return -ENOMEM;
}
}
@@ -103,9 +103,9 @@ int hibernate_nvs_alloc(void)
}
/**
- * hibernate_nvs_save - save NVS memory regions
+ * suspend_nvs_save - save NVS memory regions
*/
-void hibernate_nvs_save(void)
+void suspend_nvs_save(void)
{
struct nvs_page *entry;
@@ -119,12 +119,12 @@ void hibernate_nvs_save(void)
}
/**
- * hibernate_nvs_restore - restore NVS memory regions
+ * suspend_nvs_restore - restore NVS memory regions
*
* This function is going to be called with interrupts disabled, so it
* cannot iounmap the virtual addresses used to access the NVS region.
*/
-void hibernate_nvs_restore(void)
+void suspend_nvs_restore(void)
{
struct nvs_page *entry;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..028a99598f49 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/delay.h>
+#include <linux/workqueue.h>
/*
* Timeout for stopping processes
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only)
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
+ bool wq_busy = false;
struct timeval start, end;
u64 elapsed_csecs64;
unsigned int elapsed_csecs;
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only)
do_gettimeofday(&start);
end_time = jiffies + TIMEOUT;
+
+ if (!sig_only)
+ freeze_workqueues_begin();
+
while (true) {
todo = 0;
read_lock(&tasklist_lock);
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only)
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
+
+ if (!sig_only) {
+ wq_busy = freeze_workqueues_busy();
+ todo += wq_busy;
+ }
+
if (!todo || time_after(jiffies, end_time))
break;
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only)
*/
printk("\n");
printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
- "(%d tasks refusing to freeze):\n",
- elapsed_csecs / 100, elapsed_csecs % 100, todo);
+ "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ elapsed_csecs / 100, elapsed_csecs % 100,
+ todo - wq_busy, wq_busy);
+
+ thaw_workqueues();
+
read_lock(&tasklist_lock);
do_each_thread(g, p) {
task_lock(p);
@@ -157,6 +173,7 @@ void thaw_processes(void)
oom_killer_enable();
printk("Restarting tasks ... ");
+ thaw_workqueues();
thaw_tasks(true);
thaw_tasks(false);
schedule();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..5e7edfb05e66 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
*
* This file provides system snapshot/restore functionality for swsusp.
*
- * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*
* This file is released under the GPLv2.
@@ -1086,6 +1086,7 @@ void swsusp_free(void)
buffer = NULL;
alloc_normal = 0;
alloc_highmem = 0;
+ hibernation_thaw_swap();
}
/* Helper functions used for the shrinking of memory. */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 56e7dbb8b996..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -16,6 +16,12 @@
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/gfp.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
#include "power.h"
@@ -130,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
if (suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
- return error;
+ goto Platform_finish;
}
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
- goto Platfrom_finish;
+ goto Platform_finish;
}
if (suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
- goto Power_up_devices;
+ goto Platform_wake;
}
if (suspend_test(TEST_PLATFORM))
@@ -157,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
- if (!suspend_test(TEST_CORE))
+ if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
error = suspend_ops->enter(state);
+ events_check_enabled = false;
+ }
sysdev_resume();
}
@@ -172,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
if (suspend_ops->wake)
suspend_ops->wake();
- Power_up_devices:
dpm_resume_noirq(PMSG_RESUME);
- Platfrom_finish:
+ Platform_finish:
if (suspend_ops->finish)
suspend_ops->finish();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..5d0059eed3e4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
* This file provides functions for reading the suspend image from
* and writing it to a swap partition.
*
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*
* This file is released under the GPLv2.
@@ -32,7 +32,7 @@
/*
* The swap map is a data structure used for keeping track of each page
* written to a swap partition. It consists of many swap_map_page
- * structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
* These structures are stored on the swap and linked together with the
* help of the .next_swap member.
*
@@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap)
{
unsigned long offset;
- offset = swp_offset(get_swap_page_of_type(swap));
+ offset = swp_offset(get_swap_for_hibernation(swap));
if (offset) {
if (swsusp_extents_insert(offset))
- swap_free(swp_entry(swap, offset));
+ swap_free_for_hibernation(swp_entry(swap, offset));
else
return swapdev_block(swap, offset);
}
@@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
/**
* free_all_swap_pages - free swap pages allocated for saving image data.
- * It also frees the extents used to register which swap entres had been
+ * It also frees the extents used to register which swap entries had been
* allocated.
*/
@@ -163,7 +163,7 @@ void free_all_swap_pages(int swap)
ext = container_of(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
- swap_free(swp_entry(swap, offset));
+ swap_free_for_hibernation(swp_entry(swap, offset));
kfree(ext);
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 75077ad0b537..8fe465ac008a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,9 +33,12 @@
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
+#include <linux/kdb.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/uaccess.h>
@@ -413,6 +416,22 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
}
+#ifdef CONFIG_KGDB_KDB
+/* kdb dmesg command needs access to the syslog buffer. do_syslog()
+ * uses locks so it cannot be used during debugging. Just tell kdb
+ * where the start and end of the physical and logical logs are. This
+ * is equivalent to do_syslog(3).
+ */
+void kdb_syslog_data(char *syslog_data[4])
+{
+ syslog_data[0] = log_buf;
+ syslog_data[1] = log_buf + log_buf_len;
+ syslog_data[2] = log_buf + log_end -
+ (logged_chars < log_buf_len ? logged_chars : log_buf_len);
+ syslog_data[3] = log_buf + log_end;
+}
+#endif /* CONFIG_KGDB_KDB */
+
/*
* Call the console drivers on a range of log_buf
*/
@@ -586,6 +605,14 @@ asmlinkage int printk(const char *fmt, ...)
va_list args;
int r;
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ va_start(args, fmt);
+ r = vkdb_printf(fmt, args);
+ va_end(args);
+ return r;
+ }
+#endif
va_start(args, fmt);
r = vprintk(fmt, args);
va_end(args);
@@ -960,6 +987,32 @@ void resume_console(void)
}
/**
+ * console_cpu_notify - print deferred console messages after CPU hotplug
+ * @self: notifier struct
+ * @action: CPU hotplug event
+ * @hcpu: unused
+ *
+ * If printk() is called from a CPU that is not online yet, the messages
+ * will be spooled but will not show up on the console. This function is
+ * called when a new CPU comes online (or fails to come up), and ensures
+ * that any such output gets printed.
+ */
+static int __cpuinit console_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ case CPU_DYING:
+ case CPU_DOWN_FAILED:
+ case CPU_UP_CANCELED:
+ acquire_console_sem();
+ release_console_sem();
+ }
+ return NOTIFY_OK;
+}
+
+/**
* acquire_console_sem - lock the console system for exclusive use.
*
* Acquires a semaphore which guarantees that the caller has
@@ -1346,7 +1399,7 @@ int unregister_console(struct console *console)
}
EXPORT_SYMBOL(unregister_console);
-static int __init disable_boot_consoles(void)
+static int __init printk_late_init(void)
{
struct console *con;
@@ -1357,9 +1410,10 @@ static int __init disable_boot_consoles(void)
unregister_console(con);
}
}
+ hotcpu_notifier(console_cpu_notify, 0);
return 0;
}
-late_initcall(disable_boot_consoles);
+late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
@@ -1495,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
chars = logged_chars;
spin_unlock_irqrestore(&logbuf_lock, flags);
- if (logged_chars > end) {
- s1 = log_buf + log_buf_len - logged_chars + end;
- l1 = logged_chars - end;
+ if (chars > end) {
+ s1 = log_buf + log_buf_len - chars + end;
+ l1 = chars - end;
s2 = log_buf;
l2 = end;
@@ -1505,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
s1 = "";
l1 = 0;
- s2 = log_buf + end - logged_chars;
- l2 = logged_chars;
+ s2 = log_buf + end - chars;
+ l2 = chars;
}
if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
diff --git a/kernel/profile.c b/kernel/profile.c
index dfadc5b729f1..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- node = cpu_to_node(cpu);
+ node = cpu_to_mem(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -388,7 +388,7 @@ out_free:
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
per_cpu(cpu_profile_hits, cpu)[1] = NULL;
__free_page(page);
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
if (prof_cpu_mask != NULL)
@@ -567,7 +567,7 @@ static int create_hash_tables(void)
int cpu;
for_each_online_cpu(cpu) {
- int node = cpu_to_node(cpu);
+ int node = cpu_to_mem(cpu);
struct page *page;
page = alloc_pages_exact_node(node,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6af9cdd558b7..74a3d693c196 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -594,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_detach(child, data);
break;
+#ifdef CONFIG_BINFMT_ELF_FDPIC
+ case PTRACE_GETFDPIC: {
+ struct mm_struct *mm = get_task_mm(child);
+ unsigned long tmp = 0;
+
+ ret = -ESRCH;
+ if (!mm)
+ break;
+
+ switch (addr) {
+ case PTRACE_GETFDPIC_EXEC:
+ tmp = mm->context.exec_fdpic_loadmap;
+ break;
+ case PTRACE_GETFDPIC_INTERP:
+ tmp = mm->context.interp_fdpic_loadmap;
+ break;
+ default:
+ break;
+ }
+ mmput(mm);
+
+ ret = put_user(tmp, (unsigned long __user *) data);
+ break;
+ }
+#endif
+
#ifdef PTRACE_SINGLESTEP
case PTRACE_SINGLESTEP:
#endif
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
#include <linux/range.h>
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
{
if (start >= end)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void)
}
EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
#endif /* #ifdef CONFIG_PROVE_RCU */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+static inline void debug_init_rcu_head(struct rcu_head *head)
+{
+ debug_object_init(head, &rcuhead_debug_descr);
+}
+
+static inline void debug_rcu_head_free(struct rcu_head *head)
+{
+ debug_object_free(head, &rcuhead_debug_descr);
+}
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
+{
+ struct rcu_head *head = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ /*
+ * Ensure that queued callbacks are all executed.
+ * If we detect that we are nested in a RCU read-side critical
+ * section, we should simply fail, otherwise we would deadlock.
+ */
+ if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+ irqs_disabled()) {
+ WARN_ON(1);
+ return 0;
+ }
+ rcu_barrier();
+ rcu_barrier_sched();
+ rcu_barrier_bh();
+ debug_object_init(head, &rcuhead_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ * Activation is performed internally by call_rcu().
+ */
+static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ struct rcu_head *head = addr;
+
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ /*
+ * This is not really a fixup. We just make sure that it is
+ * tracked in the object tracker.
+ */
+ debug_object_init(head, &rcuhead_debug_descr);
+ debug_object_activate(head, &rcuhead_debug_descr);
+ return 0;
+
+ case ODEBUG_STATE_ACTIVE:
+ /*
+ * Ensure that queued callbacks are all executed.
+ * If we detect that we are nested in a RCU read-side critical
+ * section, we should simply fail, otherwise we would deadlock.
+ */
+ if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+ irqs_disabled()) {
+ WARN_ON(1);
+ return 0;
+ }
+ rcu_barrier();
+ rcu_barrier_sched();
+ rcu_barrier_bh();
+ debug_object_activate(head, &rcuhead_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
+{
+ struct rcu_head *head = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ /*
+ * Ensure that queued callbacks are all executed.
+ * If we detect that we are nested in a RCU read-side critical
+ * section, we should simply fail, otherwise we would deadlock.
+ */
+#ifndef CONFIG_PREEMPT
+ WARN_ON(1);
+ return 0;
+#else
+ if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+ irqs_disabled()) {
+ WARN_ON(1);
+ return 0;
+ }
+ rcu_barrier();
+ rcu_barrier_sched();
+ rcu_barrier_bh();
+ debug_object_free(head, &rcuhead_debug_descr);
+ return 1;
+#endif
+ default:
+ return 0;
+ }
+}
+
+/**
+ * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects of a new rcu_head structure that
+ * has been allocated as an auto variable on the stack. This function
+ * is not required for rcu_head structures that are statically defined or
+ * that are dynamically allocated on the heap. This function has no
+ * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void init_rcu_head_on_stack(struct rcu_head *head)
+{
+ debug_object_init_on_stack(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
+
+/**
+ * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects that an on-stack rcu_head structure
+ * is about to go out of scope. As with init_rcu_head_on_stack(), this
+ * function is not required for rcu_head structures that are statically
+ * defined or that are dynamically allocated on the heap. Also as with
+ * init_rcu_head_on_stack(), this function has no effect for
+ * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void destroy_rcu_head_on_stack(struct rcu_head *head)
+{
+ debug_object_free(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
+
+struct debug_obj_descr rcuhead_debug_descr = {
+ .name = "rcu_head",
+ .fixup_init = rcuhead_fixup_init,
+ .fixup_activate = rcuhead_fixup_activate,
+ .fixup_free = rcuhead_fixup_free,
+};
+EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
while (list) {
next = list->next;
prefetch(next);
+ debug_rcu_head_unqueue(list);
list->func(list);
list = next;
}
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
{
unsigned long flags;
+ debug_rcu_head_queue(head);
head->func = func;
head->next = NULL;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
rcu_random(struct rcu_random_state *rrsp)
{
if (--rrsp->rrs_count < 0) {
- rrsp->rrs_state +=
- (unsigned long)cpu_clock(raw_smp_processor_id());
+ rrsp->rrs_state += (unsigned long)local_clock();
rrsp->rrs_count = RCU_RANDOM_REFRESH;
}
rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
while (list) {
next = list->next;
prefetch(next);
+ debug_rcu_head_unqueue(list);
list->func(list);
list = next;
if (++count >= rdp->blimit)
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
unsigned long flags;
struct rcu_data *rdp;
+ debug_rcu_head_queue(head);
head->func = func;
head->next = NULL;
diff --git a/kernel/relay.c b/kernel/relay.c
index 3d97f2821611..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
"relay_hotcpu_callback: cpu %d buffer "
"creation failed\n", hotcpu);
mutex_unlock(&relay_channels_mutex);
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
}
}
mutex_unlock(&relay_channels_mutex);
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
size_t read_subbuf = read_start / subbuf_size;
size_t padding = rbuf->padding[read_subbuf];
size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
return 0;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
/*
* Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
+ nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
}
}
+ ret = 0;
if (!spd.nr_pages)
- return 0;
+ goto out;
ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
if (ret < 0 || ret < total_len)
- return ret;
+ goto out;
if (read_start + ret == nonpad_end)
ret += padding;
+out:
+ splice_shrink_spd(pipe, &spd);
return ret;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index 9c358e263534..7b36976e5dea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -15,6 +15,7 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
+#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
@@ -681,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res)
* release_region releases a matching busy region.
*/
+static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
+
/**
* __request_region - create a new busy resource region
* @parent: parent resource descriptor
@@ -693,6 +696,7 @@ struct resource * __request_region(struct resource *parent,
resource_size_t start, resource_size_t n,
const char *name, int flags)
{
+ DECLARE_WAITQUEUE(wait, current);
struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
if (!res)
@@ -717,7 +721,15 @@ struct resource * __request_region(struct resource *parent,
if (!(conflict->flags & IORESOURCE_BUSY))
continue;
}
-
+ if (conflict->flags & flags & IORESOURCE_MUXED) {
+ add_wait_queue(&muxed_resource_wait, &wait);
+ write_unlock(&resource_lock);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ remove_wait_queue(&muxed_resource_wait, &wait);
+ write_lock(&resource_lock);
+ continue;
+ }
/* Uhhuh, that didn't work out.. */
kfree(res);
res = NULL;
@@ -791,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start,
break;
*p = res->sibling;
write_unlock(&resource_lock);
+ if (res->flags & IORESOURCE_MUXED)
+ wake_up(&muxed_resource_wait);
kfree(res);
return;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index d9c0368eeb21..41541d79e3c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
+#include "workqueue_sched.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -306,52 +307,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
*/
struct task_group init_task_group;
-/* return group to which a task belongs */
-static inline struct task_group *task_group(struct task_struct *p)
-{
- struct task_group *tg;
-
-#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
- struct task_group, css);
-#else
- tg = &init_task_group;
-#endif
- return tg;
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
- /*
- * Strictly speaking this rcu_read_lock() is not needed since the
- * task_group is tied to the cgroup, which in turn can never go away
- * as long as there are tasks attached to it.
- *
- * However since task_group() uses task_subsys_state() which is an
- * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
- */
- rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
- p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
- p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
- p->rt.rt_rq = task_group(p)->rt_rq[cpu];
- p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
- rcu_read_unlock();
-}
-
-#else
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
- return NULL;
-}
-
#endif /* CONFIG_CGROUP_SCHED */
/* CFS-related fields in a runqueue */
@@ -502,9 +457,10 @@ struct rq {
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
- unsigned char in_nohz_recently;
+ unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;
@@ -544,6 +500,8 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
+ unsigned long cpu_power;
+
unsigned char idle_at_tick;
/* For active balancing */
int post_schedule;
@@ -642,6 +600,49 @@ static inline int cpu_of(struct rq *rq)
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this tasks belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification
+ * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * holds that lock for each task it moves into the cgroup. Therefore
+ * by holding that lock, we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ struct cgroup_subsys_state *css;
+
+ css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+ lockdep_is_held(&task_rq(p)->lock));
+ return container_of(css, struct task_group, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+ p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ p->rt.rt_rq = task_group(p)->rt_rq[cpu];
+ p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
inline void update_rq_clock(struct rq *rq)
{
if (!rq->skip_clock_update)
@@ -969,14 +970,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
-void task_rq_unlock_wait(struct task_struct *p)
-{
- struct rq *rq = task_rq(p);
-
- smp_mb(); /* spin-unlock-wait is not a full memory barrier */
- raw_spin_unlock_wait(&rq->lock);
-}
-
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -1202,6 +1195,27 @@ static void resched_cpu(int cpu)
#ifdef CONFIG_NO_HZ
/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd))
+ if (!idle_cpu(i))
+ return i;
+ }
+ return cpu;
+}
+/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
* which is scheduled to wake up that CPU. In case of a completely
@@ -1241,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
-int nohz_ratelimit(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 diff = rq->clock - rq->nohz_stamp;
-
- rq->nohz_stamp = rq->clock;
-
- return diff < (NSEC_PER_SEC / HZ) >> 1;
-}
-
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -1263,6 +1267,12 @@ static void sched_avg_update(struct rq *rq)
s64 period = sched_avg_period();
while ((s64)(rq->clock - rq->age_stamp) > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (rq->age_stamp));
rq->age_stamp += period;
rq->rt_avg /= 2;
}
@@ -1507,24 +1517,9 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static struct sched_group *group_of(int cpu)
-{
- struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
-
- if (!sd)
- return NULL;
-
- return sd->groups;
-}
-
static unsigned long power_of(int cpu)
{
- struct sched_group *group = group_of(cpu);
-
- if (!group)
- return SCHED_LOAD_SCALE;
-
- return group->cpu_power;
+ return cpu_rq(cpu)->cpu_power;
}
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1670,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
if (root_task_group_empty())
return;
- now = cpu_clock(raw_smp_processor_id());
+ now = local_clock();
elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1681,9 +1676,6 @@ static void update_shares(struct sched_domain *sd)
static void update_h_load(long cpu)
{
- if (root_task_group_empty())
- return;
-
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
@@ -1826,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -1862,8 +1855,8 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
if (task_has_rt_policy(p)) {
- p->se.load.weight = prio_to_weight[0] * 2;
- p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+ p->se.load.weight = 0;
+ p->se.load.inv_weight = WMULT_CONST;
return;
}
@@ -2288,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
}
#endif
-/***
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+ bool is_sync, bool is_migrate, bool is_local,
+ unsigned long en_flags)
+{
+ schedstat_inc(p, se.statistics.nr_wakeups);
+ if (is_sync)
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
+ if (is_migrate)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+ if (is_local)
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ else
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+
+ activate_task(rq, p, en_flags);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+ int wake_flags, bool success)
+{
+ trace_sched_wakeup(p, success);
+ check_preempt_curr(rq, p, wake_flags);
+
+ p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_woken)
+ p->sched_class->task_woken(rq, p);
+
+ if (unlikely(rq->idle_stamp)) {
+ u64 delta = rq->clock - rq->idle_stamp;
+ u64 max = 2*sysctl_sched_migration_cost;
+
+ if (delta > max)
+ rq->avg_idle = max;
+ else
+ update_avg(&rq->avg_idle, delta);
+ rq->idle_stamp = 0;
+ }
+#endif
+ /* if a worker is waking up, notify workqueue */
+ if ((p->flags & PF_WQ_WORKER) && success)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/**
* try_to_wake_up - wake up a thread
- * @p: the to-be-woken-up thread
+ * @p: the thread to be awakened
* @state: the mask of task states that can be woken
- * @sync: do a synchronous wakeup?
+ * @wake_flags: wake modifier flags (WF_*)
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
@@ -2300,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * returns failure only if the task is already active.
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state,
int wake_flags)
@@ -2380,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
out_activate:
#endif /* CONFIG_SMP */
- schedstat_inc(p, se.statistics.nr_wakeups);
- if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (orig_cpu != cpu)
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (cpu == this_cpu)
- schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
- activate_task(rq, p, en_flags);
+ ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+ cpu == this_cpu, en_flags);
success = 1;
-
out_running:
- trace_sched_wakeup(p, success);
- check_preempt_curr(rq, p, wake_flags);
-
- p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
- p->sched_class->task_woken(rq, p);
-
- if (unlikely(rq->idle_stamp)) {
- u64 delta = rq->clock - rq->idle_stamp;
- u64 max = 2*sysctl_sched_migration_cost;
-
- if (delta > max)
- rq->avg_idle = max;
- else
- update_avg(&rq->avg_idle, delta);
- rq->idle_stamp = 0;
- }
-#endif
+ ttwu_post_activation(p, rq, wake_flags, success);
out:
task_rq_unlock(rq, &flags);
put_cpu();
@@ -2420,6 +2431,37 @@ out:
}
/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not alredy there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task. this_rq() stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+ bool success = false;
+
+ BUG_ON(rq != this_rq());
+ BUG_ON(p == current);
+ lockdep_assert_held(&rq->lock);
+
+ if (!(p->state & TASK_NORMAL))
+ return;
+
+ if (!p->se.on_rq) {
+ if (likely(!task_running(rq, p))) {
+ schedstat_inc(rq, ttwu_count);
+ schedstat_inc(rq, ttwu_local);
+ }
+ ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+ success = true;
+ }
+ ttwu_post_activation(p, rq, 0, success);
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2515,7 +2557,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
+ /*
+ * The child is not yet in the pid-hash so no cgroup attach races,
+ * and the cgroup is pinned to this child due to cgroup_fork()
+ * is ran before sched_fork().
+ *
+ * Silence PROVE_RCU.
+ */
+ rcu_read_lock();
set_task_cpu(p, cpu);
+ rcu_read_unlock();
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
@@ -2885,9 +2936,9 @@ unsigned long nr_iowait(void)
return sum;
}
-unsigned long nr_iowait_cpu(void)
+unsigned long nr_iowait_cpu(int cpu)
{
- struct rq *this = this_rq();
+ struct rq *this = cpu_rq(cpu);
return atomic_read(&this->nr_iowait);
}
@@ -3024,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
*/
static void update_cpu_load(struct rq *this_rq)
{
unsigned long this_load = this_rq->load.weight;
+ unsigned long curr_jiffies = jiffies;
+ unsigned long pending_updates;
int i, scale;
this_rq->nr_load_updates++;
+ /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
/* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;
/* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -3048,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
* example.
*/
if (new_load > old_load)
- new_load += scale-1;
- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+ update_cpu_load(this_rq);
calc_load_account_active(this_rq);
}
@@ -3438,7 +3574,7 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load(rq);
+ update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);
@@ -3610,7 +3746,6 @@ need_resched:
rq = cpu_rq(cpu);
rcu_note_context_switch(cpu);
prev = rq->curr;
- switch_count = &prev->nivcsw;
release_kernel_lock(prev);
need_resched_nonpreemptible:
@@ -3623,11 +3758,26 @@ need_resched_nonpreemptible:
raw_spin_lock_irq(&rq->lock);
clear_tsk_need_resched(prev);
+ switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- if (unlikely(signal_pending_state(prev->state, prev)))
+ if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
- else
+ } else {
+ /*
+ * If a worker is going to sleep, notify and
+ * ask workqueue whether it wants to wake up a
+ * task to maintain concurrency. If so, wake
+ * up the task.
+ */
+ if (prev->flags & PF_WQ_WORKER) {
+ struct task_struct *to_wakeup;
+
+ to_wakeup = wq_worker_sleeping(prev, cpu);
+ if (to_wakeup)
+ try_to_wake_up_local(to_wakeup);
+ }
deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ }
switch_count = &prev->nvcsw;
}
@@ -3649,8 +3799,10 @@ need_resched_nonpreemptible:
context_switch(rq, prev, next); /* unlocks the rq */
/*
- * the context switch might have flipped the stack from under
- * us, hence refresh the local variables.
+ * The context switch have flipped the stack from under us
+ * and restored the local variables which were saved when
+ * this task called schedule() in the past. prev == current
+ * is still correct, but it can be moved to another cpu/rq.
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3659,11 +3811,8 @@ need_resched_nonpreemptible:
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(current) < 0)) {
- prev = rq->curr;
- switch_count = &prev->nivcsw;
+ if (unlikely(reacquire_kernel_lock(prev)))
goto need_resched_nonpreemptible;
- }
preempt_enable_no_resched();
if (need_resched())
@@ -3738,7 +3887,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void __sched preempt_schedule(void)
+asmlinkage void __sched notrace preempt_schedule(void)
{
struct thread_info *ti = current_thread_info();
@@ -3750,9 +3899,9 @@ asmlinkage void __sched preempt_schedule(void)
return;
do {
- add_preempt_count(PREEMPT_ACTIVE);
+ add_preempt_count_notrace(PREEMPT_ACTIVE);
schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
+ sub_preempt_count_notrace(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -4062,6 +4211,23 @@ int __sched wait_for_completion_killable(struct completion *x)
EXPORT_SYMBOL(wait_for_completion_killable);
/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ */
+unsigned long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
* try_wait_for_completion - try to decrement a completion without blocking
* @x: completion structure
*
@@ -4436,12 +4602,8 @@ recheck:
*/
if (user && !capable(CAP_SYS_NICE)) {
if (rt_policy(policy)) {
- unsigned long rlim_rtprio;
-
- if (!lock_task_sighand(p, &flags))
- return -ESRCH;
- rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
- unlock_task_sighand(p, &flags);
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
/* can't set/change the rt policy */
if (policy != p->policy && !rlim_rtprio)
@@ -4469,16 +4631,6 @@ recheck:
}
if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0)
- return -EPERM;
-#endif
-
retval = security_task_setscheduler(p, policy, param);
if (retval)
return retval;
@@ -4494,6 +4646,22 @@ recheck:
* runqueue lock must be held.
*/
rq = __task_rq_lock(p);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (user) {
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0) {
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EPERM;
+ }
+ }
+#endif
+
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
@@ -5805,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
*/
static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
- .priority = 10
+ .priority = CPU_PRI_MIGRATION,
};
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ set_cpu_active((long)hcpu, true);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ set_cpu_active((long)hcpu, false);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
static int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
- /* Start one for the boot CPU: */
+ /* Initialize migration for the boot CPU */
err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
+ /* Register cpu active notifiers */
+ cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+ cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
return 0;
}
early_initcall(migration_init);
@@ -6053,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
free_rootdomain(old_rd);
}
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd)
{
- gfp_t gfp = GFP_KERNEL;
-
memset(rd, 0, sizeof(*rd));
- if (bootmem)
- gfp = GFP_NOWAIT;
-
- if (!alloc_cpumask_var(&rd->span, gfp))
+ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
- if (!alloc_cpumask_var(&rd->online, gfp))
+ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_online;
- if (cpupri_init(&rd->cpupri, bootmem) != 0)
+ if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
return 0;
@@ -6085,7 +6277,7 @@ out:
static void init_defrootdomain(void)
{
- init_rootdomain(&def_root_domain, true);
+ init_rootdomain(&def_root_domain);
atomic_set(&def_root_domain.refcount, 1);
}
@@ -6098,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
if (!rd)
return NULL;
- if (init_rootdomain(rd, false) != 0) {
+ if (init_rootdomain(rd) != 0) {
kfree(rd);
return NULL;
}
@@ -7277,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-#ifndef CONFIG_CPUSETS
/*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
*/
-static int update_sched_domains(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
{
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- partition_sched_domains(1, NULL, NULL);
+ cpuset_update_active_cpus();
return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ cpuset_update_active_cpus();
+ return NOTIFY_OK;
default:
return NOTIFY_DONE;
}
}
-#endif
static int update_runtime(struct notifier_block *nfb,
unsigned long action, void *hcpu)
@@ -7345,10 +7543,8 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
-#ifndef CONFIG_CPUSETS
- /* XXX: Theoretical race here - CPU may be hotplugged now */
- hotcpu_notifier(update_sched_domains, 0);
-#endif
+ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+ hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
/* RT runtime code needs to handle some hotplug events */
hotcpu_notifier(update_runtime, 0);
@@ -7593,9 +7789,13 @@ void __init sched_init(void)
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
+
+ rq->last_load_update_tick = jiffies;
+
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
+ rq->cpu_power = SCHED_LOAD_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -7605,6 +7805,10 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+ rq->nohz_balance_kick = 0;
+ init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -7649,8 +7853,11 @@ void __init sched_init(void)
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+ atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
@@ -7759,9 +7966,9 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#ifdef CONFIG_IA64
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for the IA64 MCA handling.
+ * These functions are only useful for the IA64 MCA handling, or kdb.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -7781,6 +7988,9 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
/**
* set_curr_task - set the current task for a given cpu.
* @cpu: the processor in question.
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
* Ingo Molnar <mingo@redhat.com>
* Guillaume Chazarain <guichaz@gmail.com>
*
- * Create a semi stable clock from a mixture of other events, including:
- * - gtod
+ *
+ * What:
+ *
+ * cpu_clock(i) provides a fast (execution time) high resolution
+ * clock with bounded drift between CPUs. The value of cpu_clock(i)
+ * is monotonic for constant i. The timestamp returned is in nanoseconds.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ *
+ * There is no strict promise about the base, although it tends to start
+ * at 0 on boot (but people really shouldn't rely on that).
+ *
+ * cpu_clock(i) -- can be used from any context, including NMI.
+ * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
+ * local_clock() -- is cpu_clock() on the current cpu.
+ *
+ * How:
+ *
+ * The implementation either uses sched_clock() when
+ * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
+ * sched_clock() is assumed to provide these properties (mostly it means
+ * the architecture provides a globally synchronized highres time source).
+ *
+ * Otherwise it tries to create a semi stable clock from a mixture of other
+ * clocks, including:
+ *
+ * - GTOD (clock monotomic)
* - sched_clock()
* - explicit idle events
*
- * We use gtod as base and the unstable clock deltas. The deltas are filtered,
- * making it monotonic and keeping it within an expected window.
+ * We use GTOD as base and use sched_clock() deltas to improve resolution. The
+ * deltas are filtered to provide monotonicity and keeping it within an
+ * expected window.
*
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
- * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
- * consistent between cpus (never more than 2 jiffies difference).
+ *
+ * Notes:
+ *
+ * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
+ * like cpufreq interrupts that can change the base clock (TSC) multiplier
+ * and cause funny jumps in time -- although the filtering provided by
+ * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
+ * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
+ * sched_clock().
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
@@ -41,6 +77,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
}
+EXPORT_SYMBOL_GPL(sched_clock);
static __read_mostly int sched_clock_running;
@@ -169,6 +206,11 @@ again:
return val;
}
+/*
+ * Similar to cpu_clock(), but requires local IRQs to be disabled.
+ *
+ * See cpu_clock().
+ */
u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
@@ -236,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-unsigned long long cpu_clock(int cpu)
+/*
+ * As outlined at the top, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ */
+u64 cpu_clock(int cpu)
{
- unsigned long long clock;
+ u64 clock;
unsigned long flags;
local_irq_save(flags);
@@ -248,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
return clock;
}
+/*
+ * Similar to cpu_clock() for the current cpu. Time will only be observed
+ * to be monotonic if care is taken to only compare timestampt taken on the
+ * same CPU.
+ *
+ * See cpu_clock().
+ */
+u64 local_clock(void)
+{
+ u64 clock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ clock = sched_clock_cpu(smp_processor_id());
+ local_irq_restore(flags);
+
+ return clock;
+}
+
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
void sched_clock_init(void)
@@ -263,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
}
-
-unsigned long long cpu_clock(int cpu)
+u64 cpu_clock(int cpu)
{
return sched_clock_cpu(cpu);
}
+u64 local_clock(void)
+{
+ return sched_clock_cpu(0);
+}
+
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
EXPORT_SYMBOL_GPL(cpu_clock);
+EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
*
* Returns: -ENOMEM if memory fails.
*/
-int cpupri_init(struct cpupri *cp, bool bootmem)
+int cpupri_init(struct cpupri *cp)
{
- gfp_t gfp = GFP_KERNEL;
int i;
- if (bootmem)
- gfp = GFP_NOWAIT;
-
memset(cp, 0, sizeof(*cp));
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
raw_spin_lock_init(&vec->lock);
vec->count = 0;
- if (!zalloc_cpumask_var(&vec->mask, gfp))
+ if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
goto cleanup;
}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
int cpupri_find(struct cpupri *cp,
struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
-int cpupri_init(struct cpupri *cp, bool bootmem);
+int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
#else
#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87a330a7185f..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);
- PN(sysctl_sched_child_runs_first);
+ P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
#undef P
@@ -381,15 +381,9 @@ __initcall(init_sched_debug_procfs);
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
- unsigned long flags;
- int num_threads = 1;
-
- if (lock_task_sighand(p, &flags)) {
- num_threads = atomic_read(&p->signal->count);
- unlock_task_sighand(p, &flags);
- }
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+ get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------\n");
#define __P(F) \
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 217e4a9393e4..806d1b227a21 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
- unsigned int imbalance;
struct task_group *tg;
unsigned long weight;
int balanced;
@@ -1241,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* effect of the currently running task from the load
* of the current CPU:
*/
+ rcu_read_lock();
if (sync) {
tg = task_group(current);
weight = current->se.load.weight;
@@ -1252,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
tg = task_group(p);
weight = p->se.load.weight;
- imbalance = 100 + (sd->imbalance_pct - 100) / 2;
-
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
@@ -1263,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- balanced = !this_load ||
- 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
- imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
+ if (this_load) {
+ unsigned long this_eff_load, prev_eff_load;
+
+ this_eff_load = 100;
+ this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+
+ balanced = this_eff_load <= prev_eff_load;
+ } else
+ balanced = true;
+ rcu_read_unlock();
/*
* If the currently running task will sleep within
@@ -2276,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
- else
- power *= default_scale_freq_power(sd, cpu);
-
- power >>= SCHED_LOAD_SHIFT;
-
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
if (sched_feat(ARCH_POWER))
power *= arch_scale_smt_power(sd, cpu);
@@ -2292,12 +2296,22 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power >>= SCHED_LOAD_SHIFT;
}
+ sdg->cpu_power_orig = power;
+
+ if (sched_feat(ARCH_POWER))
+ power *= arch_scale_freq_power(sd, cpu);
+ else
+ power *= default_scale_freq_power(sd, cpu);
+
+ power >>= SCHED_LOAD_SHIFT;
+
power *= scale_rt_power(cpu);
power >>= SCHED_LOAD_SHIFT;
if (!power)
power = 1;
+ cpu_rq(cpu)->cpu_power = power;
sdg->cpu_power = power;
}
@@ -2323,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
sdg->cpu_power = power;
}
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+ /*
+ * Only siblings can have significantly less than SCHED_LOAD_SCALE
+ */
+ if (sd->level != SD_LV_SIBLING)
+ return 0;
+
+ /*
+ * If ~90% of the cpu_power is still there, we're good.
+ */
+ if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ return 1;
+
+ return 0;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @sd: The sched_domain whose statistics are to be updated.
@@ -2388,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu) {
- *balance = 0;
- return;
+ if (idle != CPU_NEWLY_IDLE && local_group) {
+ if (balance_cpu != this_cpu) {
+ *balance = 0;
+ return;
+ }
+ update_group_power(sd, this_cpu);
}
- update_group_power(sd, this_cpu);
-
/* Adjust by relative CPU power of the group */
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
@@ -2416,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
sgs->group_capacity =
DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ if (!sgs->group_capacity)
+ sgs->group_capacity = fix_small_capacity(sd, group);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs,
+ int this_cpu)
+{
+ if (sgs->avg_load <= sds->max_load)
+ return false;
+
+ if (sgs->sum_nr_running > sgs->group_capacity)
+ return true;
+
+ if (sgs->group_imb)
+ return true;
+
+ /*
+ * ASYM_PACKING needs to move all the work to the lowest
+ * numbered CPUs in the group, therefore mark all groups
+ * higher than ourself as busy.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ this_cpu < group_first_cpu(sg)) {
+ if (!sds->busiest)
+ return true;
+
+ if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ return true;
+ }
+
+ return false;
}
/**
@@ -2423,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
@@ -2434,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
- struct sched_group *group = sd->groups;
+ struct sched_group *sg = sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
@@ -2447,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
do {
int local_group;
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+ update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
if (local_group && !(*balance))
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += group->cpu_power;
+ sds->total_pwr += sg->cpu_power;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the group capacity to one so that we'll try
+ * first, lower the sg capacity to one so that we'll try
* and move all the excess tasks away.
*/
if (prefer_sibling)
@@ -2469,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
if (local_group) {
sds->this_load = sgs.avg_load;
- sds->this = group;
+ sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
- } else if (sgs.avg_load > sds->max_load &&
- (sgs.sum_nr_running > sgs.group_capacity ||
- sgs.group_imb)) {
+ } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
- sds->busiest = group;
+ sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->group_imb = sgs.group_imb;
}
- update_sd_power_savings_stats(group, sds, local_group, &sgs);
- group = group->next;
- } while (group != sd->groups);
+ update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+ sg = sg->next;
+ } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ * sched doman.
+ *
+ * This is primarily intended to used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share less core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on. Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ int busiest_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ return 0;
+
+ if (!sds->busiest)
+ return 0;
+
+ busiest_cpu = group_first_cpu(sds->busiest);
+ if (this_cpu > busiest_cpu)
+ return 0;
+
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ SCHED_LOAD_SCALE);
+ return 1;
}
/**
@@ -2680,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!(*balance))
goto ret;
+ if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(sd, &sds, this_cpu, imbalance))
+ return sds.busiest;
+
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
@@ -2714,8 +2850,9 @@ ret:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+ enum cpu_idle_type idle, unsigned long imbalance,
+ const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -2726,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
unsigned long wl;
+ if (!capacity)
+ capacity = fix_small_capacity(sd, group);
+
if (!cpumask_test_cpu(i, cpus))
continue;
@@ -2765,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+ int busiest_cpu, int this_cpu)
{
if (idle == CPU_NEWLY_IDLE) {
+
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * higher numbered CPUs in order to pack all tasks in the
+ * lowest numbered CPUs.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ return 1;
+
/*
* The only task running in a non-idle cpu can be moved to this
* cpu in an attempt to completely freeup the other CPU
@@ -2842,7 +2992,7 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2886,7 +3036,8 @@ redo:
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
- if (need_active_balance(sd, sd_idle, idle)) {
+ if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+ this_cpu)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -3081,13 +3232,40 @@ out_unlock:
}
#ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+ csd->func = trigger_sched_softirq;
+ csd->info = NULL;
+ csd->flags = 0;
+ csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ * entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ * it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ */
static struct {
atomic_t load_balancer;
- cpumask_var_t cpu_mask;
- cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
- .load_balancer = ATOMIC_INIT(-1),
-};
+ atomic_t first_pick_cpu;
+ atomic_t second_pick_cpu;
+ cpumask_var_t idle_cpus_mask;
+ cpumask_var_t grp_idle_mask;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
int get_nohz_load_balancer(void)
{
@@ -3141,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
*/
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
- cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
sched_group_cpus(ilb_group));
/*
* A sched_group is semi-idle when it has atleast one busy cpu
* and atleast one idle cpu.
*/
- if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ if (cpumask_empty(nohz.grp_idle_mask))
return 0;
- if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
return 0;
return 1;
@@ -3184,7 +3362,7 @@ static int find_new_ilb(int cpu)
* Optimize for the case when we have no idle CPUs or only one
* idle CPU. Don't walk the sched_domain hierarchy in such cases
*/
- if (cpumask_weight(nohz.cpu_mask) < 2)
+ if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3192,7 +3370,7 @@ static int find_new_ilb(int cpu)
do {
if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.ilb_grp_nohz_mask);
+ return cpumask_first(nohz.grp_idle_mask);
ilb_group = ilb_group->next;
@@ -3200,98 +3378,116 @@ static int find_new_ilb(int cpu)
}
out_done:
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#endif
/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+ int ilb_cpu;
+
+ nohz.next_balance++;
+
+ ilb_cpu = get_nohz_load_balancer();
+
+ if (ilb_cpu >= nr_cpu_ids) {
+ ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+ }
+
+ if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+ struct call_single_data *cp;
+
+ cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ __smp_call_function_single(ilb_cpu, cp, 0);
+ }
+ return;
+}
+
+/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
*
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
*
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
*/
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
if (stop_tick) {
- cpu_rq(cpu)->in_nohz_recently = 1;
-
if (!cpu_active(cpu)) {
if (atomic_read(&nohz.load_balancer) != cpu)
- return 0;
+ return;
/*
* If we are going offline and still the leader,
* give up!
*/
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
- return 0;
+ return;
}
- cpumask_set_cpu(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- if (atomic_read(&nohz.load_balancer) == cpu)
- atomic_set(&nohz.load_balancer, -1);
- return 0;
- }
+ if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.load_balancer) == -1) {
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
int new_ilb;
- if (!(sched_smt_power_savings ||
- sched_mc_power_savings))
- return 1;
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+ cpu) != nr_cpu_ids)
+ return;
+
/*
* Check to see if there is a more power-efficient
* ilb.
*/
new_ilb = find_new_ilb(cpu);
if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, -1);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
resched_cpu(new_ilb);
- return 0;
+ return;
}
- return 1;
+ return;
}
} else {
- if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- return 0;
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return;
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
}
- return 0;
+ return;
}
#endif
@@ -3373,11 +3569,102 @@ out:
rq->next_balance = next_balance;
}
+#ifdef CONFIG_NO_HZ
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *rq;
+ int balance_cpu;
+
+ if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ return;
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ this_rq->nohz_balance_kick = 0;
+ break;
+ }
+
+ raw_spin_lock_irq(&this_rq->lock);
+ update_rq_clock(this_rq);
+ update_cpu_load(this_rq);
+ raw_spin_unlock_irq(&this_rq->lock);
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ nohz.next_balance = this_rq->next_balance;
+ this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ * idle load balancer when it has more than one process active. This
+ * eliminates the need for idle load balancing altogether when we have
+ * only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ * to run for active_load_balance to happen (i.e., two busy CPUs are
+ * SMT or core siblings and can run better if they move to different
+ * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ * which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+ unsigned long now = jiffies;
+ int ret;
+ int first_pick_cpu, second_pick_cpu;
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (!rq->nr_running)
+ return 0;
+
+ first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+ second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+ if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+ second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ return 0;
+
+ ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ if (rq->nr_running > 1)
+ return 1;
+ } else {
+ ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ if (rq->nr_running)
+ return 1;
+ }
+ }
+ return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
@@ -3387,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
rebalance_domains(this_cpu, idle);
-#ifdef CONFIG_NO_HZ
/*
- * If this cpu is the owner for idle load balancing, then do the
+ * If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- if (this_rq->idle_at_tick &&
- atomic_read(&nohz.load_balancer) == this_cpu) {
- struct rq *rq;
- int balance_cpu;
-
- for_each_cpu(balance_cpu, nohz.cpu_mask) {
- if (balance_cpu == this_cpu)
- continue;
-
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
-
- rebalance_domains(balance_cpu, CPU_IDLE);
-
- rq = cpu_rq(balance_cpu);
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
- }
- }
-#endif
+ nohz_idle_balance(this_cpu, idle);
}
static inline int on_null_domain(int cpu)
@@ -3427,57 +3689,17 @@ static inline int on_null_domain(int cpu)
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
*/
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
-#ifdef CONFIG_NO_HZ
- /*
- * If we were in the nohz mode recently and busy at the current
- * scheduler tick, then check if we need to nominate new idle
- * load balancer.
- */
- if (rq->in_nohz_recently && !rq->idle_at_tick) {
- rq->in_nohz_recently = 0;
-
- if (atomic_read(&nohz.load_balancer) == cpu) {
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
- atomic_set(&nohz.load_balancer, -1);
- }
-
- if (atomic_read(&nohz.load_balancer) == -1) {
- int ilb = find_new_ilb(cpu);
-
- if (ilb < nr_cpu_ids)
- resched_cpu(ilb);
- }
- }
-
- /*
- * If this cpu is idle and doing idle load balancing for all the
- * cpus with ticks stopped, is it time for that to stop?
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- resched_cpu(cpu);
- return;
- }
-
- /*
- * If this cpu is idle and the idle load balancing is done by
- * someone else, then no need raise the SCHED_SOFTIRQ
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpumask_test_cpu(cpu, nohz.cpu_mask))
- return;
-#endif
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+ else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ nohz_balancer_kick(cpu);
+#endif
}
static void rq_online_fair(struct rq *rq)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..d10c80ebb67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
{
unsigned long soft, hard;
- if (!p->signal)
- return;
-
/* max may change after cur was read, this will be fixed next tick */
soft = task_rlimit(p, RLIMIT_RTTIME);
hard = task_rlimit_max(p, RLIMIT_RTTIME);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
static inline void account_group_user_time(struct task_struct *tsk,
cputime_t cputime)
{
- struct thread_group_cputimer *cputimer;
-
- /* tsk == current, ensure it is safe to use ->signal */
- if (unlikely(tsk->exit_state))
- return;
-
- cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
if (!cputimer->running)
return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
static inline void account_group_system_time(struct task_struct *tsk,
cputime_t cputime)
{
- struct thread_group_cputimer *cputimer;
-
- /* tsk == current, ensure it is safe to use ->signal */
- if (unlikely(tsk->exit_state))
- return;
-
- cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
if (!cputimer->running)
return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
static inline void account_group_exec_runtime(struct task_struct *tsk,
unsigned long long ns)
{
- struct thread_group_cputimer *cputimer;
- struct signal_struct *sig;
-
- sig = tsk->signal;
- /* see __exit_signal()->task_rq_unlock_wait() */
- barrier();
- if (unlikely(!sig))
- return;
-
- cputimer = &sig->cputimer;
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
if (!cputimer->running)
return;
diff --git a/kernel/signal.c b/kernel/signal.c
index dbd7fe073c55..bded65187780 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,12 +637,12 @@ static inline bool si_fromuser(const struct siginfo *info)
/*
* Bad permissions for sending the signal
- * - the caller must hold at least the RCU read lock
+ * - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
{
- const struct cred *cred = current_cred(), *tcred;
+ const struct cred *cred, *tcred;
struct pid *sid;
int error;
@@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
if (error)
return error;
+ cred = current_cred();
tcred = __task_cred(t);
- if ((cred->euid ^ tcred->suid) &&
+ if (!same_thread_group(current, t) &&
+ (cred->euid ^ tcred->suid) &&
(cred->euid ^ tcred->uid) &&
(cred->uid ^ tcred->suid) &&
(cred->uid ^ tcred->uid) &&
@@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
/*
* Nuke all other threads in the group.
*/
-void zap_other_threads(struct task_struct *p)
+int zap_other_threads(struct task_struct *p)
{
- struct task_struct *t;
+ struct task_struct *t = p;
+ int count = 0;
p->signal->group_stop_count = 0;
- for (t = next_thread(p); t != p; t = next_thread(t)) {
- /*
- * Don't bother with already dead threads
- */
+ while_each_thread(p, t) {
+ count++;
+
+ /* Don't bother with already dead threads */
if (t->exit_state)
continue;
-
- /* SIGKILL will be handled before any pending SIGSTOP */
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
}
+
+ return count;
}
struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
@@ -1124,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
/*
* send signal info to all the members of a group
- * - the caller must hold the RCU read lock at least
*/
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
- int ret = check_kill_permission(sig, info, p);
+ int ret;
+
+ rcu_read_lock();
+ ret = check_kill_permission(sig, info, p);
+ rcu_read_unlock();
if (!ret && sig)
ret = do_send_sig_info(sig, info, p, true);
@@ -2735,3 +2741,43 @@ void __init signals_init(void)
{
sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}
+
+#ifdef CONFIG_KGDB_KDB
+#include <linux/kdb.h>
+/*
+ * kdb_send_sig_info - Allows kdb to send signals without exposing
+ * signal internals. This function checks if the required locks are
+ * available before calling the main signal code, to avoid kdb
+ * deadlocks.
+ */
+void
+kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
+{
+ static struct task_struct *kdb_prev_t;
+ int sig, new_t;
+ if (!spin_trylock(&t->sighand->siglock)) {
+ kdb_printf("Can't do kill command now.\n"
+ "The sigmask lock is held somewhere else in "
+ "kernel, try again later\n");
+ return;
+ }
+ spin_unlock(&t->sighand->siglock);
+ new_t = kdb_prev_t != t;
+ kdb_prev_t = t;
+ if (t->state != TASK_RUNNING && new_t) {
+ kdb_printf("Process is not RUNNING, sending a signal from "
+ "kdb risks deadlock\n"
+ "on the run queue locks. "
+ "The signal has _not_ been sent.\n"
+ "Reissue the kill command if you want to risk "
+ "the deadlock.\n");
+ return;
+ }
+ sig = info->si_signo;
+ if (send_sig_info(sig, info, t))
+ kdb_printf("Fail to deliver Signal %d to process %d.\n",
+ sig, t->pid);
+ else
+ kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
+}
+#endif /* CONFIG_KGDB_KDB */
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Slow work debugging
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include "slow-work.h"
-
-#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
-#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
-#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
-
-void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
-{
- seq_puts(m, "Slow-work: New thread");
-}
-
-/*
- * Render the time mark field on a work item into a 5-char time with units plus
- * a space
- */
-static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
-{
- struct timespec now, diff;
-
- now = CURRENT_TIME;
- diff = timespec_sub(now, work->mark);
-
- if (diff.tv_sec < 0)
- seq_puts(m, " -ve ");
- else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
- seq_printf(m, "%3luns ", diff.tv_nsec);
- else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
- seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
- else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
- seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
- else if (diff.tv_sec <= 1)
- seq_puts(m, " 1s ");
- else if (diff.tv_sec < 60)
- seq_printf(m, "%4lus ", diff.tv_sec);
- else if (diff.tv_sec < 60 * 60)
- seq_printf(m, "%4lum ", diff.tv_sec / 60);
- else if (diff.tv_sec < 60 * 60 * 24)
- seq_printf(m, "%4luh ", diff.tv_sec / 3600);
- else
- seq_puts(m, "exces ");
-}
-
-/*
- * Describe a slow work item for debugfs
- */
-static int slow_work_runqueue_show(struct seq_file *m, void *v)
-{
- struct slow_work *work;
- struct list_head *p = v;
- unsigned long id;
-
- switch ((unsigned long) v) {
- case 1:
- seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
- return 0;
- case 2:
- seq_puts(m, "=== ===== ================ == ===== ==========\n");
- return 0;
-
- case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
- id = (unsigned long) v - 3;
-
- read_lock(&slow_work_execs_lock);
- work = slow_work_execs[id];
- if (work) {
- smp_read_barrier_depends();
-
- seq_printf(m, "%3lu %5d %16p %2lx ",
- id, slow_work_pids[id], work, work->flags);
- slow_work_print_mark(m, work);
-
- if (work->ops->desc)
- work->ops->desc(work, m);
- seq_putc(m, '\n');
- }
- read_unlock(&slow_work_execs_lock);
- return 0;
-
- default:
- work = list_entry(p, struct slow_work, link);
- seq_printf(m, "%3s - %16p %2lx ",
- work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
- work, work->flags);
- slow_work_print_mark(m, work);
-
- if (work->ops->desc)
- work->ops->desc(work, m);
- seq_putc(m, '\n');
- return 0;
- }
-}
-
-/*
- * map the iterator to a work item
- */
-static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
-{
- struct list_head *p;
- unsigned long count, id;
-
- switch (*_pos >> ITERATOR_SHIFT) {
- case 0x0:
- if (*_pos == 0)
- *_pos = 1;
- if (*_pos < 3)
- return (void *)(unsigned long) *_pos;
- if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
- for (id = *_pos - 3;
- id < SLOW_WORK_THREAD_LIMIT;
- id++, (*_pos)++)
- if (slow_work_execs[id])
- return (void *)(unsigned long) *_pos;
- *_pos = 0x1UL << ITERATOR_SHIFT;
-
- case 0x1:
- count = *_pos & ITERATOR_COUNTER;
- list_for_each(p, &slow_work_queue) {
- if (count == 0)
- return p;
- count--;
- }
- *_pos = 0x2UL << ITERATOR_SHIFT;
-
- case 0x2:
- count = *_pos & ITERATOR_COUNTER;
- list_for_each(p, &vslow_work_queue) {
- if (count == 0)
- return p;
- count--;
- }
- *_pos = 0x3UL << ITERATOR_SHIFT;
-
- default:
- return NULL;
- }
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
-{
- spin_lock_irq(&slow_work_queue_lock);
- return slow_work_runqueue_index(m, _pos);
-}
-
-/*
- * move to the next line
- */
-static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
-{
- struct list_head *p = v;
- unsigned long selector = *_pos >> ITERATOR_SHIFT;
-
- (*_pos)++;
- switch (selector) {
- case 0x0:
- return slow_work_runqueue_index(m, _pos);
-
- case 0x1:
- if (*_pos >> ITERATOR_SHIFT == 0x1) {
- p = p->next;
- if (p != &slow_work_queue)
- return p;
- }
- *_pos = 0x2UL << ITERATOR_SHIFT;
- p = &vslow_work_queue;
-
- case 0x2:
- if (*_pos >> ITERATOR_SHIFT == 0x2) {
- p = p->next;
- if (p != &vslow_work_queue)
- return p;
- }
- *_pos = 0x3UL << ITERATOR_SHIFT;
-
- default:
- return NULL;
- }
-}
-
-/*
- * clean up after reading
- */
-static void slow_work_runqueue_stop(struct seq_file *m, void *v)
-{
- spin_unlock_irq(&slow_work_queue_lock);
-}
-
-static const struct seq_operations slow_work_runqueue_ops = {
- .start = slow_work_runqueue_start,
- .stop = slow_work_runqueue_stop,
- .next = slow_work_runqueue_next,
- .show = slow_work_runqueue_show,
-};
-
-/*
- * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
- */
-static int slow_work_runqueue_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &slow_work_runqueue_ops);
-}
-
-const struct file_operations slow_work_runqueue_fops = {
- .owner = THIS_MODULE,
- .open = slow_work_runqueue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
-/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- *
- * See Documentation/slow-work.txt
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-#include <linux/wait.h>
-#include <linux/debugfs.h>
-#include "slow-work.h"
-
-static void slow_work_cull_timeout(unsigned long);
-static void slow_work_oom_timeout(unsigned long);
-
-#ifdef CONFIG_SYSCTL
-static int slow_work_min_threads_sysctl(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-
-static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
-#endif
-
-/*
- * The pool of threads has at least min threads in it as long as someone is
- * using the facility, and may have as many as max.
- *
- * A portion of the pool may be processing very slow operations.
- */
-static unsigned slow_work_min_threads = 2;
-static unsigned slow_work_max_threads = 4;
-static unsigned vslow_work_proportion = 50; /* % of threads that may process
- * very slow work */
-
-#ifdef CONFIG_SYSCTL
-static const int slow_work_min_min_threads = 2;
-static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
-static const int slow_work_min_vslow = 1;
-static const int slow_work_max_vslow = 99;
-
-ctl_table slow_work_sysctls[] = {
- {
- .procname = "min-threads",
- .data = &slow_work_min_threads,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = slow_work_min_threads_sysctl,
- .extra1 = (void *) &slow_work_min_min_threads,
- .extra2 = &slow_work_max_threads,
- },
- {
- .procname = "max-threads",
- .data = &slow_work_max_threads,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = slow_work_max_threads_sysctl,
- .extra1 = &slow_work_min_threads,
- .extra2 = (void *) &slow_work_max_max_threads,
- },
- {
- .procname = "vslow-percentage",
- .data = &vslow_work_proportion,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = (void *) &slow_work_min_vslow,
- .extra2 = (void *) &slow_work_max_vslow,
- },
- {}
-};
-#endif
-
-/*
- * The active state of the thread pool
- */
-static atomic_t slow_work_thread_count;
-static atomic_t vslow_work_executing_count;
-
-static bool slow_work_may_not_start_new_thread;
-static bool slow_work_cull; /* cull a thread due to lack of activity */
-static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
-static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
-static struct slow_work slow_work_new_thread; /* new thread starter */
-
-/*
- * slow work ID allocation (use slow_work_queue_lock)
- */
-static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
-
-/*
- * Unregistration tracking to prevent put_ref() from disappearing during module
- * unload
- */
-#ifdef CONFIG_MODULES
-static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
-static struct module *slow_work_unreg_module;
-static struct slow_work *slow_work_unreg_work_item;
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
-static DEFINE_MUTEX(slow_work_unreg_sync_lock);
-
-static void slow_work_set_thread_processing(int id, struct slow_work *work)
-{
- if (work)
- slow_work_thread_processing[id] = work->owner;
-}
-static void slow_work_done_thread_processing(int id, struct slow_work *work)
-{
- struct module *module = slow_work_thread_processing[id];
-
- slow_work_thread_processing[id] = NULL;
- smp_mb();
- if (slow_work_unreg_work_item == work ||
- slow_work_unreg_module == module)
- wake_up_all(&slow_work_unreg_wq);
-}
-static void slow_work_clear_thread_processing(int id)
-{
- slow_work_thread_processing[id] = NULL;
-}
-#else
-static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
-static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
-static void slow_work_clear_thread_processing(int id) {}
-#endif
-
-/*
- * Data for tracking currently executing items for indication through /proc
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
-pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
-DEFINE_RWLOCK(slow_work_execs_lock);
-#endif
-
-/*
- * The queues of work items and the lock governing access to them. These are
- * shared between all the CPUs. It doesn't make sense to have per-CPU queues
- * as the number of threads bears no relation to the number of CPUs.
- *
- * There are two queues of work items: one for slow work items, and one for
- * very slow work items.
- */
-LIST_HEAD(slow_work_queue);
-LIST_HEAD(vslow_work_queue);
-DEFINE_SPINLOCK(slow_work_queue_lock);
-
-/*
- * The following are two wait queues that get pinged when a work item is placed
- * on an empty queue. These allow work items that are hogging a thread by
- * sleeping in a way that could be deferred to yield their thread and enqueue
- * themselves.
- */
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
-static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
-
-/*
- * The thread controls. A variable used to signal to the threads that they
- * should exit when the queue is empty, a waitqueue used by the threads to wait
- * for signals, and a completion set by the last thread to exit.
- */
-static bool slow_work_threads_should_exit;
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
-static DECLARE_COMPLETION(slow_work_last_thread_exited);
-
-/*
- * The number of users of the thread pool and its lock. Whilst this is zero we
- * have no threads hanging around, and when this reaches zero, we wait for all
- * active or queued work items to complete and kill all the threads we do have.
- */
-static int slow_work_user_count;
-static DEFINE_MUTEX(slow_work_user_lock);
-
-static inline int slow_work_get_ref(struct slow_work *work)
-{
- if (work->ops->get_ref)
- return work->ops->get_ref(work);
-
- return 0;
-}
-
-static inline void slow_work_put_ref(struct slow_work *work)
-{
- if (work->ops->put_ref)
- work->ops->put_ref(work);
-}
-
-/*
- * Calculate the maximum number of active threads in the pool that are
- * permitted to process very slow work items.
- *
- * The answer is rounded up to at least 1, but may not equal or exceed the
- * maximum number of the threads in the pool. This means we always have at
- * least one thread that can process slow work items, and we always have at
- * least one thread that won't get tied up doing so.
- */
-static unsigned slow_work_calc_vsmax(void)
-{
- unsigned vsmax;
-
- vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
- vsmax /= 100;
- vsmax = max(vsmax, 1U);
- return min(vsmax, slow_work_max_threads - 1);
-}
-
-/*
- * Attempt to execute stuff queued on a slow thread. Return true if we managed
- * it, false if there was nothing to do.
- */
-static noinline bool slow_work_execute(int id)
-{
- struct slow_work *work = NULL;
- unsigned vsmax;
- bool very_slow;
-
- vsmax = slow_work_calc_vsmax();
-
- /* see if we can schedule a new thread to be started if we're not
- * keeping up with the work */
- if (!waitqueue_active(&slow_work_thread_wq) &&
- (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
- atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
- !slow_work_may_not_start_new_thread)
- slow_work_enqueue(&slow_work_new_thread);
-
- /* find something to execute */
- spin_lock_irq(&slow_work_queue_lock);
- if (!list_empty(&vslow_work_queue) &&
- atomic_read(&vslow_work_executing_count) < vsmax) {
- work = list_entry(vslow_work_queue.next,
- struct slow_work, link);
- if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
- BUG();
- list_del_init(&work->link);
- atomic_inc(&vslow_work_executing_count);
- very_slow = true;
- } else if (!list_empty(&slow_work_queue)) {
- work = list_entry(slow_work_queue.next,
- struct slow_work, link);
- if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
- BUG();
- list_del_init(&work->link);
- very_slow = false;
- } else {
- very_slow = false; /* avoid the compiler warning */
- }
-
- slow_work_set_thread_processing(id, work);
- if (work) {
- slow_work_mark_time(work);
- slow_work_begin_exec(id, work);
- }
-
- spin_unlock_irq(&slow_work_queue_lock);
-
- if (!work)
- return false;
-
- if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
- BUG();
-
- /* don't execute if the work is in the process of being cancelled */
- if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
- work->ops->execute(work);
-
- if (very_slow)
- atomic_dec(&vslow_work_executing_count);
- clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
-
- /* wake up anyone waiting for this work to be complete */
- wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
-
- slow_work_end_exec(id, work);
-
- /* if someone tried to enqueue the item whilst we were executing it,
- * then it'll be left unenqueued to avoid multiple threads trying to
- * execute it simultaneously
- *
- * there is, however, a race between us testing the pending flag and
- * getting the spinlock, and between the enqueuer setting the pending
- * flag and getting the spinlock, so we use a deferral bit to tell us
- * if the enqueuer got there first
- */
- if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
- spin_lock_irq(&slow_work_queue_lock);
-
- if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
- test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
- goto auto_requeue;
-
- spin_unlock_irq(&slow_work_queue_lock);
- }
-
- /* sort out the race between module unloading and put_ref() */
- slow_work_put_ref(work);
- slow_work_done_thread_processing(id, work);
-
- return true;
-
-auto_requeue:
- /* we must complete the enqueue operation
- * - we transfer our ref on the item back to the appropriate queue
- * - don't wake another thread up as we're awake already
- */
- slow_work_mark_time(work);
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
- list_add_tail(&work->link, &vslow_work_queue);
- else
- list_add_tail(&work->link, &slow_work_queue);
- spin_unlock_irq(&slow_work_queue_lock);
- slow_work_clear_thread_processing(id);
- return true;
-}
-
-/**
- * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
- * work: The work item under execution that wants to sleep
- * _timeout: Scheduler sleep timeout
- *
- * Allow a requeueable work item to sleep on a slow-work processor thread until
- * that thread is needed to do some other work or the sleep is interrupted by
- * some other event.
- *
- * The caller must set up a wake up event before calling this and must have set
- * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
- * condition before calling this function as no test is made here.
- *
- * False is returned if there is nothing on the queue; true is returned if the
- * work item should be requeued
- */
-bool slow_work_sleep_till_thread_needed(struct slow_work *work,
- signed long *_timeout)
-{
- wait_queue_head_t *wfo_wq;
- struct list_head *queue;
-
- DEFINE_WAIT(wait);
-
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
- wfo_wq = &vslow_work_queue_waits_for_occupation;
- queue = &vslow_work_queue;
- } else {
- wfo_wq = &slow_work_queue_waits_for_occupation;
- queue = &slow_work_queue;
- }
-
- if (!list_empty(queue))
- return true;
-
- add_wait_queue_exclusive(wfo_wq, &wait);
- if (list_empty(queue))
- *_timeout = schedule_timeout(*_timeout);
- finish_wait(wfo_wq, &wait);
-
- return !list_empty(queue);
-}
-EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
-
-/**
- * slow_work_enqueue - Schedule a slow work item for processing
- * @work: The work item to queue
- *
- * Schedule a slow work item for processing. If the item is already undergoing
- * execution, this guarantees not to re-enter the execution routine until the
- * first execution finishes.
- *
- * The item is pinned by this function as it retains a reference to it, managed
- * through the item operations. The item is unpinned once it has been
- * executed.
- *
- * An item may hog the thread that is running it for a relatively large amount
- * of time, sufficient, for example, to perform several lookup, mkdir, create
- * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
- *
- * Conversely, if a number of items are awaiting processing, it may take some
- * time before any given item is given attention. The number of threads in the
- * pool may be increased to deal with demand, but only up to a limit.
- *
- * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
- * the very slow queue, from which only a portion of the threads will be
- * allowed to pick items to execute. This ensures that very slow items won't
- * overly block ones that are just ordinarily slow.
- *
- * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
- * attempted queued)
- */
-int slow_work_enqueue(struct slow_work *work)
-{
- wait_queue_head_t *wfo_wq;
- struct list_head *queue;
- unsigned long flags;
- int ret;
-
- if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
- return -ECANCELED;
-
- BUG_ON(slow_work_user_count <= 0);
- BUG_ON(!work);
- BUG_ON(!work->ops);
-
- /* when honouring an enqueue request, we only promise that we will run
- * the work function in the future; we do not promise to run it once
- * per enqueue request
- *
- * we use the PENDING bit to merge together repeat requests without
- * having to disable IRQs and take the spinlock, whilst still
- * maintaining our promise
- */
- if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
- wfo_wq = &vslow_work_queue_waits_for_occupation;
- queue = &vslow_work_queue;
- } else {
- wfo_wq = &slow_work_queue_waits_for_occupation;
- queue = &slow_work_queue;
- }
-
- spin_lock_irqsave(&slow_work_queue_lock, flags);
-
- if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
- goto cancelled;
-
- /* we promise that we will not attempt to execute the work
- * function in more than one thread simultaneously
- *
- * this, however, leaves us with a problem if we're asked to
- * enqueue the work whilst someone is executing the work
- * function as simply queueing the work immediately means that
- * another thread may try executing it whilst it is already
- * under execution
- *
- * to deal with this, we set the ENQ_DEFERRED bit instead of
- * enqueueing, and the thread currently executing the work
- * function will enqueue the work item when the work function
- * returns and it has cleared the EXECUTING bit
- */
- if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
- set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
- } else {
- ret = slow_work_get_ref(work);
- if (ret < 0)
- goto failed;
- slow_work_mark_time(work);
- list_add_tail(&work->link, queue);
- wake_up(&slow_work_thread_wq);
-
- /* if someone who could be requeued is sleeping on a
- * thread, then ask them to yield their thread */
- if (work->link.prev == queue)
- wake_up(wfo_wq);
- }
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- }
- return 0;
-
-cancelled:
- ret = -ECANCELED;
-failed:
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(slow_work_enqueue);
-
-static int slow_work_wait(void *word)
-{
- schedule();
- return 0;
-}
-
-/**
- * slow_work_cancel - Cancel a slow work item
- * @work: The work item to cancel
- *
- * This function will cancel a previously enqueued work item. If we cannot
- * cancel the work item, it is guarenteed to have run when this function
- * returns.
- */
-void slow_work_cancel(struct slow_work *work)
-{
- bool wait = true, put = false;
-
- set_bit(SLOW_WORK_CANCELLING, &work->flags);
- smp_mb();
-
- /* if the work item is a delayed work item with an active timer, we
- * need to wait for the timer to finish _before_ getting the spinlock,
- * lest we deadlock against the timer routine
- *
- * the timer routine will leave DELAYED set if it notices the
- * CANCELLING flag in time
- */
- if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
- struct delayed_slow_work *dwork =
- container_of(work, struct delayed_slow_work, work);
- del_timer_sync(&dwork->timer);
- }
-
- spin_lock_irq(&slow_work_queue_lock);
-
- if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
- /* the timer routine aborted or never happened, so we are left
- * holding the timer's reference on the item and should just
- * drop the pending flag and wait for any ongoing execution to
- * finish */
- struct delayed_slow_work *dwork =
- container_of(work, struct delayed_slow_work, work);
-
- BUG_ON(timer_pending(&dwork->timer));
- BUG_ON(!list_empty(&work->link));
-
- clear_bit(SLOW_WORK_DELAYED, &work->flags);
- put = true;
- clear_bit(SLOW_WORK_PENDING, &work->flags);
-
- } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
- !list_empty(&work->link)) {
- /* the link in the pending queue holds a reference on the item
- * that we will need to release */
- list_del_init(&work->link);
- wait = false;
- put = true;
- clear_bit(SLOW_WORK_PENDING, &work->flags);
-
- } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
- /* the executor is holding our only reference on the item, so
- * we merely need to wait for it to finish executing */
- clear_bit(SLOW_WORK_PENDING, &work->flags);
- }
-
- spin_unlock_irq(&slow_work_queue_lock);
-
- /* the EXECUTING flag is set by the executor whilst the spinlock is set
- * and before the item is dequeued - so assuming the above doesn't
- * actually dequeue it, simply waiting for the EXECUTING flag to be
- * released here should be sufficient */
- if (wait)
- wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
- TASK_UNINTERRUPTIBLE);
-
- clear_bit(SLOW_WORK_CANCELLING, &work->flags);
- if (put)
- slow_work_put_ref(work);
-}
-EXPORT_SYMBOL(slow_work_cancel);
-
-/*
- * Handle expiry of the delay timer, indicating that a delayed slow work item
- * should now be queued if not cancelled
- */
-static void delayed_slow_work_timer(unsigned long data)
-{
- wait_queue_head_t *wfo_wq;
- struct list_head *queue;
- struct slow_work *work = (struct slow_work *) data;
- unsigned long flags;
- bool queued = false, put = false, first = false;
-
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
- wfo_wq = &vslow_work_queue_waits_for_occupation;
- queue = &vslow_work_queue;
- } else {
- wfo_wq = &slow_work_queue_waits_for_occupation;
- queue = &slow_work_queue;
- }
-
- spin_lock_irqsave(&slow_work_queue_lock, flags);
- if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
- clear_bit(SLOW_WORK_DELAYED, &work->flags);
-
- if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
- /* we discard the reference the timer was holding in
- * favour of the one the executor holds */
- set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
- put = true;
- } else {
- slow_work_mark_time(work);
- list_add_tail(&work->link, queue);
- queued = true;
- if (work->link.prev == queue)
- first = true;
- }
- }
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- if (put)
- slow_work_put_ref(work);
- if (first)
- wake_up(wfo_wq);
- if (queued)
- wake_up(&slow_work_thread_wq);
-}
-
-/**
- * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
- * @dwork: The delayed work item to queue
- * @delay: When to start executing the work, in jiffies from now
- *
- * This is similar to slow_work_enqueue(), but it adds a delay before the work
- * is actually queued for processing.
- *
- * The item can have delayed processing requested on it whilst it is being
- * executed. The delay will begin immediately, and if it expires before the
- * item finishes executing, the item will be placed back on the queue when it
- * has done executing.
- */
-int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
- unsigned long delay)
-{
- struct slow_work *work = &dwork->work;
- unsigned long flags;
- int ret;
-
- if (delay == 0)
- return slow_work_enqueue(&dwork->work);
-
- BUG_ON(slow_work_user_count <= 0);
- BUG_ON(!work);
- BUG_ON(!work->ops);
-
- if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
- return -ECANCELED;
-
- if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
- spin_lock_irqsave(&slow_work_queue_lock, flags);
-
- if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
- goto cancelled;
-
- /* the timer holds a reference whilst it is pending */
- ret = slow_work_get_ref(work);
- if (ret < 0)
- goto cant_get_ref;
-
- if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
- BUG();
- dwork->timer.expires = jiffies + delay;
- dwork->timer.data = (unsigned long) work;
- dwork->timer.function = delayed_slow_work_timer;
- add_timer(&dwork->timer);
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- }
-
- return 0;
-
-cancelled:
- ret = -ECANCELED;
-cant_get_ref:
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(delayed_slow_work_enqueue);
-
-/*
- * Schedule a cull of the thread pool at some time in the near future
- */
-static void slow_work_schedule_cull(void)
-{
- mod_timer(&slow_work_cull_timer,
- round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
-}
-
-/*
- * Worker thread culling algorithm
- */
-static bool slow_work_cull_thread(void)
-{
- unsigned long flags;
- bool do_cull = false;
-
- spin_lock_irqsave(&slow_work_queue_lock, flags);
-
- if (slow_work_cull) {
- slow_work_cull = false;
-
- if (list_empty(&slow_work_queue) &&
- list_empty(&vslow_work_queue) &&
- atomic_read(&slow_work_thread_count) >
- slow_work_min_threads) {
- slow_work_schedule_cull();
- do_cull = true;
- }
- }
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- return do_cull;
-}
-
-/*
- * Determine if there is slow work available for dispatch
- */
-static inline bool slow_work_available(int vsmax)
-{
- return !list_empty(&slow_work_queue) ||
- (!list_empty(&vslow_work_queue) &&
- atomic_read(&vslow_work_executing_count) < vsmax);
-}
-
-/*
- * Worker thread dispatcher
- */
-static int slow_work_thread(void *_data)
-{
- int vsmax, id;
-
- DEFINE_WAIT(wait);
-
- set_freezable();
- set_user_nice(current, -5);
-
- /* allocate ourselves an ID */
- spin_lock_irq(&slow_work_queue_lock);
- id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
- BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
- __set_bit(id, slow_work_ids);
- slow_work_set_thread_pid(id, current->pid);
- spin_unlock_irq(&slow_work_queue_lock);
-
- sprintf(current->comm, "kslowd%03u", id);
-
- for (;;) {
- vsmax = vslow_work_proportion;
- vsmax *= atomic_read(&slow_work_thread_count);
- vsmax /= 100;
-
- prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
- TASK_INTERRUPTIBLE);
- if (!freezing(current) &&
- !slow_work_threads_should_exit &&
- !slow_work_available(vsmax) &&
- !slow_work_cull)
- schedule();
- finish_wait(&slow_work_thread_wq, &wait);
-
- try_to_freeze();
-
- vsmax = vslow_work_proportion;
- vsmax *= atomic_read(&slow_work_thread_count);
- vsmax /= 100;
-
- if (slow_work_available(vsmax) && slow_work_execute(id)) {
- cond_resched();
- if (list_empty(&slow_work_queue) &&
- list_empty(&vslow_work_queue) &&
- atomic_read(&slow_work_thread_count) >
- slow_work_min_threads)
- slow_work_schedule_cull();
- continue;
- }
-
- if (slow_work_threads_should_exit)
- break;
-
- if (slow_work_cull && slow_work_cull_thread())
- break;
- }
-
- spin_lock_irq(&slow_work_queue_lock);
- slow_work_set_thread_pid(id, 0);
- __clear_bit(id, slow_work_ids);
- spin_unlock_irq(&slow_work_queue_lock);
-
- if (atomic_dec_and_test(&slow_work_thread_count))
- complete_and_exit(&slow_work_last_thread_exited, 0);
- return 0;
-}
-
-/*
- * Handle thread cull timer expiration
- */
-static void slow_work_cull_timeout(unsigned long data)
-{
- slow_work_cull = true;
- wake_up(&slow_work_thread_wq);
-}
-
-/*
- * Start a new slow work thread
- */
-static void slow_work_new_thread_execute(struct slow_work *work)
-{
- struct task_struct *p;
-
- if (slow_work_threads_should_exit)
- return;
-
- if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
- return;
-
- if (!mutex_trylock(&slow_work_user_lock))
- return;
-
- slow_work_may_not_start_new_thread = true;
- atomic_inc(&slow_work_thread_count);
- p = kthread_run(slow_work_thread, NULL, "kslowd");
- if (IS_ERR(p)) {
- printk(KERN_DEBUG "Slow work thread pool: OOM\n");
- if (atomic_dec_and_test(&slow_work_thread_count))
- BUG(); /* we're running on a slow work thread... */
- mod_timer(&slow_work_oom_timer,
- round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
- } else {
- /* ratelimit the starting of new threads */
- mod_timer(&slow_work_oom_timer, jiffies + 1);
- }
-
- mutex_unlock(&slow_work_user_lock);
-}
-
-static const struct slow_work_ops slow_work_new_thread_ops = {
- .owner = THIS_MODULE,
- .execute = slow_work_new_thread_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
- .desc = slow_work_new_thread_desc,
-#endif
-};
-
-/*
- * post-OOM new thread start suppression expiration
- */
-static void slow_work_oom_timeout(unsigned long data)
-{
- slow_work_may_not_start_new_thread = false;
-}
-
-#ifdef CONFIG_SYSCTL
-/*
- * Handle adjustment of the minimum number of threads
- */
-static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int n;
-
- if (ret == 0) {
- mutex_lock(&slow_work_user_lock);
- if (slow_work_user_count > 0) {
- /* see if we need to start or stop threads */
- n = atomic_read(&slow_work_thread_count) -
- slow_work_min_threads;
-
- if (n < 0 && !slow_work_may_not_start_new_thread)
- slow_work_enqueue(&slow_work_new_thread);
- else if (n > 0)
- slow_work_schedule_cull();
- }
- mutex_unlock(&slow_work_user_lock);
- }
-
- return ret;
-}
-
-/*
- * Handle adjustment of the maximum number of threads
- */
-static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int n;
-
- if (ret == 0) {
- mutex_lock(&slow_work_user_lock);
- if (slow_work_user_count > 0) {
- /* see if we need to stop threads */
- n = slow_work_max_threads -
- atomic_read(&slow_work_thread_count);
-
- if (n < 0)
- slow_work_schedule_cull();
- }
- mutex_unlock(&slow_work_user_lock);
- }
-
- return ret;
-}
-#endif /* CONFIG_SYSCTL */
-
-/**
- * slow_work_register_user - Register a user of the facility
- * @module: The module about to make use of the facility
- *
- * Register a user of the facility, starting up the initial threads if there
- * aren't any other users at this point. This will return 0 if successful, or
- * an error if not.
- */
-int slow_work_register_user(struct module *module)
-{
- struct task_struct *p;
- int loop;
-
- mutex_lock(&slow_work_user_lock);
-
- if (slow_work_user_count == 0) {
- printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
- init_completion(&slow_work_last_thread_exited);
-
- slow_work_threads_should_exit = false;
- slow_work_init(&slow_work_new_thread,
- &slow_work_new_thread_ops);
- slow_work_may_not_start_new_thread = false;
- slow_work_cull = false;
-
- /* start the minimum number of threads */
- for (loop = 0; loop < slow_work_min_threads; loop++) {
- atomic_inc(&slow_work_thread_count);
- p = kthread_run(slow_work_thread, NULL, "kslowd");
- if (IS_ERR(p))
- goto error;
- }
- printk(KERN_NOTICE "Slow work thread pool: Ready\n");
- }
-
- slow_work_user_count++;
- mutex_unlock(&slow_work_user_lock);
- return 0;
-
-error:
- if (atomic_dec_and_test(&slow_work_thread_count))
- complete(&slow_work_last_thread_exited);
- if (loop > 0) {
- printk(KERN_ERR "Slow work thread pool:"
- " Aborting startup on ENOMEM\n");
- slow_work_threads_should_exit = true;
- wake_up_all(&slow_work_thread_wq);
- wait_for_completion(&slow_work_last_thread_exited);
- printk(KERN_ERR "Slow work thread pool: Aborted\n");
- }
- mutex_unlock(&slow_work_user_lock);
- return PTR_ERR(p);
-}
-EXPORT_SYMBOL(slow_work_register_user);
-
-/*
- * wait for all outstanding items from the calling module to complete
- * - note that more items may be queued whilst we're waiting
- */
-static void slow_work_wait_for_items(struct module *module)
-{
-#ifdef CONFIG_MODULES
- DECLARE_WAITQUEUE(myself, current);
- struct slow_work *work;
- int loop;
-
- mutex_lock(&slow_work_unreg_sync_lock);
- add_wait_queue(&slow_work_unreg_wq, &myself);
-
- for (;;) {
- spin_lock_irq(&slow_work_queue_lock);
-
- /* first of all, we wait for the last queued item in each list
- * to be processed */
- list_for_each_entry_reverse(work, &vslow_work_queue, link) {
- if (work->owner == module) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- slow_work_unreg_work_item = work;
- goto do_wait;
- }
- }
- list_for_each_entry_reverse(work, &slow_work_queue, link) {
- if (work->owner == module) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- slow_work_unreg_work_item = work;
- goto do_wait;
- }
- }
-
- /* then we wait for the items being processed to finish */
- slow_work_unreg_module = module;
- smp_mb();
- for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
- if (slow_work_thread_processing[loop] == module)
- goto do_wait;
- }
- spin_unlock_irq(&slow_work_queue_lock);
- break; /* okay, we're done */
-
- do_wait:
- spin_unlock_irq(&slow_work_queue_lock);
- schedule();
- slow_work_unreg_work_item = NULL;
- slow_work_unreg_module = NULL;
- }
-
- remove_wait_queue(&slow_work_unreg_wq, &myself);
- mutex_unlock(&slow_work_unreg_sync_lock);
-#endif /* CONFIG_MODULES */
-}
-
-/**
- * slow_work_unregister_user - Unregister a user of the facility
- * @module: The module whose items should be cleared
- *
- * Unregister a user of the facility, killing all the threads if this was the
- * last one.
- *
- * This waits for all the work items belonging to the nominated module to go
- * away before proceeding.
- */
-void slow_work_unregister_user(struct module *module)
-{
- /* first of all, wait for all outstanding items from the calling module
- * to complete */
- if (module)
- slow_work_wait_for_items(module);
-
- /* then we can actually go about shutting down the facility if need
- * be */
- mutex_lock(&slow_work_user_lock);
-
- BUG_ON(slow_work_user_count <= 0);
-
- slow_work_user_count--;
- if (slow_work_user_count == 0) {
- printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
- slow_work_threads_should_exit = true;
- del_timer_sync(&slow_work_cull_timer);
- del_timer_sync(&slow_work_oom_timer);
- wake_up_all(&slow_work_thread_wq);
- wait_for_completion(&slow_work_last_thread_exited);
- printk(KERN_NOTICE "Slow work thread pool:"
- " Shut down complete\n");
- }
-
- mutex_unlock(&slow_work_user_lock);
-}
-EXPORT_SYMBOL(slow_work_unregister_user);
-
-/*
- * Initialise the slow work facility
- */
-static int __init init_slow_work(void)
-{
- unsigned nr_cpus = num_possible_cpus();
-
- if (slow_work_max_threads < nr_cpus)
- slow_work_max_threads = nr_cpus;
-#ifdef CONFIG_SYSCTL
- if (slow_work_max_max_threads < nr_cpus * 2)
- slow_work_max_max_threads = nr_cpus * 2;
-#endif
-#ifdef CONFIG_SLOW_WORK_DEBUG
- {
- struct dentry *dbdir;
-
- dbdir = debugfs_create_dir("slow_work", NULL);
- if (dbdir && !IS_ERR(dbdir))
- debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
- NULL, &slow_work_runqueue_fops);
- }
-#endif
- return 0;
-}
-
-subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Slow work private definitions
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
- * things to do */
-#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
- * OOM */
-
-#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
-
-/*
- * slow-work.c
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-extern struct slow_work *slow_work_execs[];
-extern pid_t slow_work_pids[];
-extern rwlock_t slow_work_execs_lock;
-#endif
-
-extern struct list_head slow_work_queue;
-extern struct list_head vslow_work_queue;
-extern spinlock_t slow_work_queue_lock;
-
-/*
- * slow-work-debugfs.c
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-extern const struct file_operations slow_work_runqueue_fops;
-
-extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
-#endif
-
-/*
- * Helper functions
- */
-static inline void slow_work_set_thread_pid(int id, pid_t pid)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- slow_work_pids[id] = pid;
-#endif
-}
-
-static inline void slow_work_mark_time(struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- work->mark = CURRENT_TIME;
-#endif
-}
-
-static inline void slow_work_begin_exec(int id, struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- slow_work_execs[id] = work;
-#endif
-}
-
-static inline void slow_work_end_exec(int id, struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- write_lock(&slow_work_execs_lock);
- slow_work_execs[id] = NULL;
- write_unlock(&slow_work_execs_lock);
-#endif
-}
diff --git a/kernel/smp.c b/kernel/smp.c
index 3fc697336183..75c970c715d3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_UP_PREPARE_FROZEN:
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
break;
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0db913a5c60f..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
- return NOTIFY_BAD;
+ return notifier_from_errno(PTR_ERR(p));
}
kthread_bind(p, hotcpu);
per_cpu(ksoftirqd, hotcpu) = p;
@@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
void *cpu = (void *)(long)smp_processor_id();
int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- BUG_ON(err == NOTIFY_BAD);
+ BUG_ON(err != NOTIFY_OK);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Detect Soft Lockups
- *
- * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
- *
- * this code detects soft lockups: incidents in where on a CPU
- * the kernel does not reschedule for 10 seconds or more.
- */
-#include <linux/mm.h>
-#include <linux/cpu.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/lockdep.h>
-#include <linux/notifier.h>
-#include <linux/module.h>
-#include <linux/sysctl.h>
-
-#include <asm/irq_regs.h>
-
-static DEFINE_SPINLOCK(print_lock);
-
-static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
-static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
-static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
-static DEFINE_PER_CPU(bool, softlock_touch_sync);
-
-static int __read_mostly did_panic;
-int __read_mostly softlockup_thresh = 60;
-
-/*
- * Should we panic (and reboot, if panic_timeout= is set) when a
- * soft-lockup occurs:
- */
-unsigned int __read_mostly softlockup_panic =
- CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-
-static int __init softlockup_panic_setup(char *str)
-{
- softlockup_panic = simple_strtoul(str, NULL, 0);
-
- return 1;
-}
-__setup("softlockup_panic=", softlockup_panic_setup);
-
-static int
-softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
-{
- did_panic = 1;
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block panic_block = {
- .notifier_call = softlock_panic,
-};
-
-/*
- * Returns seconds, approximately. We don't need nanosecond
- * resolution, and we don't need to waste time with a big divide when
- * 2^30ns == 1.074s.
- */
-static unsigned long get_timestamp(int this_cpu)
-{
- return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
-}
-
-static void __touch_softlockup_watchdog(void)
-{
- int this_cpu = raw_smp_processor_id();
-
- __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
-}
-
-void touch_softlockup_watchdog(void)
-{
- __raw_get_cpu_var(softlockup_touch_ts) = 0;
-}
-EXPORT_SYMBOL(touch_softlockup_watchdog);
-
-void touch_softlockup_watchdog_sync(void)
-{
- __raw_get_cpu_var(softlock_touch_sync) = true;
- __raw_get_cpu_var(softlockup_touch_ts) = 0;
-}
-
-void touch_all_softlockup_watchdogs(void)
-{
- int cpu;
-
- /* Cause each CPU to re-update its timestamp rather than complain */
- for_each_online_cpu(cpu)
- per_cpu(softlockup_touch_ts, cpu) = 0;
-}
-EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
-
-int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- touch_all_softlockup_watchdogs();
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-
-/*
- * This callback runs from the timer interrupt, and checks
- * whether the watchdog thread has hung or not:
- */
-void softlockup_tick(void)
-{
- int this_cpu = smp_processor_id();
- unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
- unsigned long print_ts;
- struct pt_regs *regs = get_irq_regs();
- unsigned long now;
-
- /* Is detection switched off? */
- if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
- /* Be sure we don't false trigger if switched back on */
- if (touch_ts)
- per_cpu(softlockup_touch_ts, this_cpu) = 0;
- return;
- }
-
- if (touch_ts == 0) {
- if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
- /*
- * If the time stamp was touched atomically
- * make sure the scheduler tick is up to date.
- */
- per_cpu(softlock_touch_sync, this_cpu) = false;
- sched_clock_tick();
- }
- __touch_softlockup_watchdog();
- return;
- }
-
- print_ts = per_cpu(softlockup_print_ts, this_cpu);
-
- /* report at most once a second */
- if (print_ts == touch_ts || did_panic)
- return;
-
- /* do not print during early bootup: */
- if (unlikely(system_state != SYSTEM_RUNNING)) {
- __touch_softlockup_watchdog();
- return;
- }
-
- now = get_timestamp(this_cpu);
-
- /*
- * Wake up the high-prio watchdog task twice per
- * threshold timespan.
- */
- if (time_after(now - softlockup_thresh/2, touch_ts))
- wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
-
- /* Warn about unreasonable delays: */
- if (time_before_eq(now - softlockup_thresh, touch_ts))
- return;
-
- per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
-
- spin_lock(&print_lock);
- printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
- this_cpu, now - touch_ts,
- current->comm, task_pid_nr(current));
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
- spin_unlock(&print_lock);
-
- if (softlockup_panic)
- panic("softlockup: hung tasks");
-}
-
-/*
- * The watchdog thread - runs every second and touches the timestamp.
- */
-static int watchdog(void *__bind_cpu)
-{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-
- sched_setscheduler(current, SCHED_FIFO, &param);
-
- /* initialize timestamp */
- __touch_softlockup_watchdog();
-
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 60 seconds then the
- * debug-printout triggers in softlockup_tick().
- */
- while (!kthread_should_stop()) {
- __touch_softlockup_watchdog();
- schedule();
-
- if (kthread_should_stop())
- break;
-
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
-
- return 0;
-}
-
-/*
- * Create/destroy watchdog threads as CPUs come and go:
- */
-static int __cpuinit
-cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int hotcpu = (unsigned long)hcpu;
- struct task_struct *p;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
- p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
- if (IS_ERR(p)) {
- printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
- return NOTIFY_BAD;
- }
- per_cpu(softlockup_touch_ts, hotcpu) = 0;
- per_cpu(softlockup_watchdog, hotcpu) = p;
- kthread_bind(p, hotcpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
- break;
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!per_cpu(softlockup_watchdog, hotcpu))
- break;
- /* Unbind so it can run. Fall thru. */
- kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
- cpumask_any(cpu_online_mask));
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- p = per_cpu(softlockup_watchdog, hotcpu);
- per_cpu(softlockup_watchdog, hotcpu) = NULL;
- kthread_stop(p);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cpu_nfb = {
- .notifier_call = cpu_callback
-};
-
-static int __initdata nosoftlockup;
-
-static int __init nosoftlockup_setup(char *str)
-{
- nosoftlockup = 1;
- return 1;
-}
-__setup("nosoftlockup", nosoftlockup_setup);
-
-static int __init spawn_softlockup_task(void)
-{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- if (nosoftlockup)
- return 0;
-
- err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- if (err == NOTIFY_BAD) {
- BUG();
- return 1;
- }
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
- register_cpu_notifier(&cpu_nfb);
-
- atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
-
- return 0;
-}
-early_initcall(spawn_softlockup_task);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4e7431e7c78..4372ccb25127 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,9 +35,9 @@ struct cpu_stop_done {
/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
spinlock_t lock;
+ bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
struct task_struct *thread; /* stopper thread */
- bool enabled; /* is this stopper enabled? */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
@@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
- case CPU_DEAD:
+ case CPU_POST_DEAD:
{
struct cpu_stop_work *work;
diff --git a/kernel/sys.c b/kernel/sys.c
index 0d36d889c74d..e83ddbbaf89d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1632,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-static void argv_cleanup(char **argv, char **envp)
+static void argv_cleanup(struct subprocess_info *info)
{
- argv_free(argv);
+ argv_free(info->argv);
}
/**
@@ -1668,7 +1668,7 @@ int orderly_poweroff(bool force)
goto out;
}
- call_usermodehelper_setcleanup(info, argv_cleanup);
+ call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
/* performance counters: */
cond_syscall(sys_perf_event_open);
+
+/* fanotify! */
+cond_syscall(sys_fanotify_init);
+cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b12583047757..ca38e8e3e907 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,21 +37,24 @@
#include <linux/highuid.h>
#include <linux/writeback.h>
#include <linux/ratelimit.h>
+#include <linux/compaction.h>
#include <linux/hugetlb.h>
#include <linux/initrd.h>
#include <linux/key.h>
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/dnotify.h>
#include <linux/syscalls.h>
#include <linux/vmstat.h>
#include <linux/nfs_fs.h>
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
-#include <linux/slow-work.h>
#include <linux/perf_event.h>
#include <linux/kprobes.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -74,15 +77,16 @@
#include <scsi/sg.h>
#endif
+#ifdef CONFIG_LOCKUP_DETECTOR
+#include <linux/nmi.h>
+#endif
+
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
-extern int sysctl_panic_on_oom;
-extern int sysctl_oom_kill_allocating_task;
-extern int sysctl_oom_dump_tasks;
extern int max_threads;
extern int core_uses_pid;
extern int suid_dumpable;
@@ -104,7 +108,7 @@ extern int blk_iopoll_enabled;
#endif
/* Constants used for minimum and maximum */
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
static int neg_one = -1;
#endif
@@ -128,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
+#ifdef CONFIG_INOTIFY_USER
+#include <linux/inotify.h>
+#endif
#ifdef CONFIG_SPARC
#include <asm/system.h>
#endif
@@ -204,9 +211,6 @@ static struct ctl_table fs_table[];
static struct ctl_table debug_table[];
static struct ctl_table dev_table[];
extern struct ctl_table random_table[];
-#ifdef CONFIG_INOTIFY_USER
-extern struct ctl_table inotify_table[];
-#endif
#ifdef CONFIG_EPOLL
extern struct ctl_table epoll_table[];
#endif
@@ -261,6 +265,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
#endif
+#ifdef CONFIG_COMPACTION
+static int min_extfrag_threshold;
+static int max_extfrag_threshold = 1000;
+#endif
+
static struct ctl_table kern_table[] = {
{
.procname = "sched_child_runs_first",
@@ -555,7 +564,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
-#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+#ifdef CONFIG_HOTPLUG
{
.procname = "hotplug",
.data = &uevent_helper,
@@ -703,7 +712,34 @@ static struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+#if defined(CONFIG_LOCKUP_DETECTOR)
+ {
+ .procname = "watchdog",
+ .data = &watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dowatchdog_enabled,
+ },
+ {
+ .procname = "watchdog_thresh",
+ .data = &softlockup_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dowatchdog_thresh,
+ .extra1 = &neg_one,
+ .extra2 = &sixty,
+ },
+ {
+ .procname = "softlockup_panic",
+ .data = &softlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
{
.procname = "unknown_nmi_panic",
.data = &unknown_nmi_panic,
@@ -806,26 +842,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_DETECT_SOFTLOCKUP
- {
- .procname = "softlockup_panic",
- .data = &softlockup_panic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
- },
- {
- .procname = "softlockup_thresh",
- .data = &softlockup_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dosoftlockup_thresh,
- .extra1 = &neg_one,
- .extra2 = &sixty,
- },
-#endif
#ifdef CONFIG_DETECT_HUNG_TASK
{
.procname = "hung_task_panic",
@@ -899,13 +915,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_SLOW_WORK
- {
- .procname = "slow-work",
- .mode = 0555,
- .child = slow_work_sysctls,
- },
-#endif
#ifdef CONFIG_PERF_EVENTS
{
.procname = "perf_event_paranoid",
@@ -1120,6 +1129,25 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
},
+#ifdef CONFIG_COMPACTION
+ {
+ .procname = "compact_memory",
+ .data = &sysctl_compact_memory,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = sysctl_compaction_handler,
+ },
+ {
+ .procname = "extfrag_threshold",
+ .data = &sysctl_extfrag_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_extfrag_handler,
+ .extra1 = &min_extfrag_threshold,
+ .extra2 = &max_extfrag_threshold,
+ },
+
+#endif /* CONFIG_COMPACTION */
{
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
@@ -1444,6 +1472,14 @@ static struct ctl_table fs_table[] = {
.child = binfmt_misc_table,
},
#endif
+ {
+ .procname = "pipe-max-size",
+ .data = &pipe_max_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &pipe_proc_fn,
+ .extra1 = &pipe_min_size,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -2083,20 +2119,20 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
#define TMPBUFLEN 22
/**
- * proc_get_long - reads an ASCII formated integer from a user buffer
+ * proc_get_long - reads an ASCII formatted integer from a user buffer
*
- * @buf - a kernel buffer
- * @size - size of the kernel buffer
- * @val - this is where the number will be stored
- * @neg - set to %TRUE if number is negative
- * @perm_tr - a vector which contains the allowed trailers
- * @perm_tr_len - size of the perm_tr vector
- * @tr - pointer to store the trailer character
+ * @buf: a kernel buffer
+ * @size: size of the kernel buffer
+ * @val: this is where the number will be stored
+ * @neg: set to %TRUE if number is negative
+ * @perm_tr: a vector which contains the allowed trailers
+ * @perm_tr_len: size of the perm_tr vector
+ * @tr: pointer to store the trailer character
*
- * In case of success 0 is returned and buf and size are updated with
- * the amount of bytes read. If tr is non NULL and a trailing
- * character exist (size is non zero after returning from this
- * function) tr is updated with the trailing character.
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes read. If @tr is non-NULL and a trailing
+ * character exists (size is non-zero after returning from this
+ * function), @tr is updated with the trailing character.
*/
static int proc_get_long(char **buf, size_t *size,
unsigned long *val, bool *neg,
@@ -2147,15 +2183,15 @@ static int proc_get_long(char **buf, size_t *size,
}
/**
- * proc_put_long - coverts an integer to a decimal ASCII formated string
+ * proc_put_long - converts an integer to a decimal ASCII formatted string
*
- * @buf - the user buffer
- * @size - the size of the user buffer
- * @val - the integer to be converted
- * @neg - sign of the number, %TRUE for negative
+ * @buf: the user buffer
+ * @size: the size of the user buffer
+ * @val: the integer to be converted
+ * @neg: sign of the number, %TRUE for negative
*
- * In case of success 0 is returned and buf and size are updated with
- * the amount of bytes read.
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes written.
*/
static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
bool neg)
@@ -2253,6 +2289,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (write) {
left -= proc_skip_spaces(&kbuf);
+ if (!left)
+ break;
err = proc_get_long(&kbuf, &left, &lval, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
@@ -2279,7 +2317,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
- if (write && !err)
+ if (write && !err && left)
left -= proc_skip_spaces(&kbuf);
free:
if (write) {
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 937d31dc8566..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
#include <linux/file.h>
#include <linux/ctype.h>
#include <linux/netdevice.h>
+#include <linux/kernel.h>
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL_SYSCALL
@@ -1124,11 +1125,6 @@ out:
return result;
}
-static unsigned hex_value(int ch)
-{
- return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
-}
-
static ssize_t bin_uuid(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
@@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
if (!isxdigit(str[0]) || !isxdigit(str[1]))
goto out;
- uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]);
+ uuid[i] = (hex_to_bin(str[0]) << 4) |
+ hex_to_bin(str[1]);
str += 2;
if (*str == '-')
str++;
diff --git a/kernel/time.c b/kernel/time.c
index 50612faa9baf..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -132,10 +132,10 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
*/
static inline void warp_clock(void)
{
- struct timespec delta, adjust;
- delta.tv_sec = sys_tz.tz_minuteswest * 60;
- delta.tv_nsec = 0;
- adjust = timespec_add_safe(current_kernel_time(), delta);
+ struct timespec adjust;
+
+ adjust = current_kernel_time();
+ adjust.tv_sec += sys_tz.tz_minuteswest * 60;
do_settimeofday(&adjust);
}
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
}
EXPORT_SYMBOL(timespec_trunc);
-#ifndef CONFIG_GENERIC_TIME
-/*
- * Simulate gettimeofday using do_gettimeofday which only allows a timeval
- * and therefore only yields usec accuracy
- */
-void getnstimeofday(struct timespec *tv)
-{
- struct timeval x;
-
- do_gettimeofday(&x);
- tv->tv_sec = x.tv_sec;
- tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
-}
-EXPORT_SYMBOL_GPL(getnstimeofday);
-#endif
-
/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
config NO_HZ
bool "Tickless System (Dynamic Ticks)"
- depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
help
This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
- depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
help
This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
return max_nsecs - (max_nsecs >> 5);
}
-#ifdef CONFIG_GENERIC_TIME
+#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
/**
* clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
}
}
-#else /* CONFIG_GENERIC_TIME */
+#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
static inline void clocksource_select(void) { }
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
#define MAX_UPDATE_LENGTH 5 /* Seconds */
/**
- * __clocksource_register_scale - Used to install new clocksources
+ * __clocksource_updatefreq_scale - Used update clocksource with new freq
* @t: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
*
- * Returns -EBUSY if registration fails, zero otherwise.
+ * This should only be called from the clocksource->enable() method.
*
* This *SHOULD NOT* be called directly! Please use the
- * clocksource_register_hz() or clocksource_register_khz helper functions.
+ * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
*/
-int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
-
/*
* Ideally we want to use some of the limits used in
* clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
NSEC_PER_SEC/scale,
MAX_UPDATE_LENGTH*scale);
cs->max_idle_ns = clocksource_max_deferment(cs);
+}
+EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+
+/**
+ * __clocksource_register_scale - Used to install new clocksources
+ * @t: clocksource to be registered
+ * @scale: Scale factor multiplied against freq to get clocksource hz
+ * @freq: clocksource frequency (cycles per second) divided by scale
+ *
+ * Returns -EBUSY if registration fails, zero otherwise.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_register_hz() or clocksource_register_khz helper functions.
+ */
+int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+
+ /* Intialize mult/shift and max_idle_ns */
+ __clocksource_updatefreq_scale(cs, scale, freq);
+ /* Add clocksource to the clcoksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_select();
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
/*
* Setup the next period for devices, which do not have
* periodic mode. We read dev->next_event first and add to it
- * when the event alrady expired. clockevents_program_event()
+ * when the event already expired. clockevents_program_event()
* sets dev->next_event only when the event is really
* programmed to the device.
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc1c034..3e216e01bbd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now)
* Updates the per cpu time idle statistics counters
*/
static void
-update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- if (nr_iowait_cpu() > 0)
+ if (nr_iowait_cpu(cpu) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
ts->idle_entrytime = now;
}
@@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- update_ts_time_stats(ts, now, NULL);
+ update_ts_time_stats(cpu, ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
}
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
{
ktime_t now;
now = ktime_get();
- update_ts_time_stats(ts, now, NULL);
+ update_ts_time_stats(cpu, ts, now, NULL);
ts->idle_entrytime = now;
ts->idle_active = 1;
@@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
if (!tick_nohz_enabled)
return -1;
- update_ts_time_stats(ts, ktime_get(), last_update_time);
+ update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
return ktime_to_us(ts->idle_sleeptime);
}
@@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
if (!tick_nohz_enabled)
return -1;
- update_ts_time_stats(ts, ktime_get(), last_update_time);
+ update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
return ktime_to_us(ts->iowait_sleeptime);
}
@@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
*/
ts->inidle = 1;
- now = tick_nohz_start_idle(ts);
+ now = tick_nohz_start_idle(cpu, ts);
/*
* If this cpu is offline and it is the one which updates
@@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle)
goto end;
}
- if (nohz_ratelimit(cpu))
- goto end;
-
ts->idle_calls++;
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -408,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
- if (select_nohz_load_balancer(1)) {
- /*
- * sched tick not stopped!
- */
- cpumask_clear_cpu(cpu, nohz_cpu_mask);
- goto out;
- }
+ select_nohz_load_balancer(1);
ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -783,7 +774,6 @@ void tick_setup_sched_timer(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now = ktime_get();
- u64 offset;
/*
* Emulate tick processing via per-CPU hrtimers:
@@ -793,10 +783,6 @@ void tick_setup_sched_timer(void)
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
- offset = ktime_to_ns(tick_period) >> 1;
- do_div(offset, num_possible_cpus());
- offset *= smp_processor_id();
- hrtimer_add_expires_ns(&ts->sched_timer, offset);
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..e14c839e9faa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
* - wall_to_monotonic is no longer the boot time, getboottime must be
* used instead.
*/
-struct timespec xtime __attribute__ ((aligned (16)));
-struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+static struct timespec xtime __attribute__ ((aligned (16)));
+static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static struct timespec total_sleep_time;
/*
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond)
{
xtime.tv_sec += leapsecond;
wall_to_monotonic.tv_sec -= leapsecond;
- update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
}
-#ifdef CONFIG_GENERIC_TIME
-
/**
* timekeeping_forward_now - update clock to the current time
*
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
timekeeper.ntp_error = 0;
ntp_clear();
- update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
tick_clock_notify();
}
-#else /* GENERIC_TIME */
-
-static inline void timekeeping_forward_now(void) { }
-
-/**
- * ktime_get - get the monotonic time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get(void)
-{
- struct timespec now;
-
- ktime_get_ts(&now);
-
- return timespec_to_ktime(now);
-}
-EXPORT_SYMBOL_GPL(ktime_get);
-
-/**
- * ktime_get_ts - get the monotonic clock in timespec format
- * @ts: pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime
- * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
- */
-void ktime_get_ts(struct timespec *ts)
-{
- struct timespec tomono;
- unsigned long seq;
-
- do {
- seq = read_seqbegin(&xtime_lock);
- getnstimeofday(ts);
- tomono = wall_to_monotonic;
-
- } while (read_seqretry(&xtime_lock, seq));
-
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
- ts->tv_nsec + tomono.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
-
-#endif /* !GENERIC_TIME */
-
/**
* ktime_get_real - get the real (wall-) time in ktime_t format
*
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev)
if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
ts = timespec_sub(ts, timekeeping_suspend_time);
- xtime = timespec_add_safe(xtime, ts);
+ xtime = timespec_add(xtime, ts);
wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
- total_sleep_time = timespec_add_safe(total_sleep_time, ts);
+ total_sleep_time = timespec_add(total_sleep_time, ts);
}
/* re-base the last cycle value */
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -784,10 +738,11 @@ void update_wall_time(void)
return;
clock = timekeeper.clock;
-#ifdef CONFIG_GENERIC_TIME
- offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
-#else
+
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = timekeeper.cycle_interval;
+#else
+ offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
@@ -856,7 +811,8 @@ void update_wall_time(void)
}
/* check to see if there is a new clocksource to use */
- update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
}
/**
@@ -887,7 +843,7 @@ EXPORT_SYMBOL_GPL(getboottime);
*/
void monotonic_to_bootbased(struct timespec *ts)
{
- *ts = timespec_add_safe(*ts, total_sleep_time);
+ *ts = timespec_add(*ts, total_sleep_time);
}
EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
@@ -902,6 +858,11 @@ struct timespec __current_kernel_time(void)
return xtime;
}
+struct timespec __get_wall_to_monotonic(void)
+{
+ return wall_to_monotonic;
+}
+
struct timespec current_kernel_time(void)
{
struct timespec now;
diff --git a/kernel/timer.c b/kernel/timer.c
index 9199f3c52215..f1b8afe1ad86 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/*
* Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB for
- * the new flag to indicate whether the timer is deferrable
+ * base in timer_list is guaranteed to be zero. Use the LSB to
+ * indicate whether the timer is deferrable.
+ *
+ * A deferrable timer will work normally when the system is busy, but
+ * will not cause a CPU to come out of idle just to service it; instead,
+ * the timer will be serviced when the CPU eventually wakes up with a
+ * subsequent non-deferrable timer.
*/
#define TBASE_DEFERRABLE_FLAG (0x1)
@@ -577,6 +582,19 @@ static void __init_timer(struct timer_list *timer,
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
+void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key,
+ void (*function)(unsigned long),
+ unsigned long data)
+{
+ timer->function = function;
+ timer->data = data;
+ init_timer_on_stack_key(timer, name, key);
+ timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
+
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
@@ -679,12 +697,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- cpu = preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+ cpu = get_nohz_timer_target();
#endif
new_base = per_cpu(tvec_bases, cpu);
@@ -750,13 +764,18 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
unsigned long expires_limit, mask;
int bit;
- expires_limit = expires + timer->slack;
+ expires_limit = expires;
- if (timer->slack < 0) /* auto slack: use 0.4% */
- expires_limit = expires + (expires - jiffies)/256;
+ if (timer->slack >= 0) {
+ expires_limit = expires + timer->slack;
+ } else {
+ unsigned long now = jiffies;
+ /* No slack, if already expired else auto slack 0.4% */
+ if (time_after(expires, now))
+ expires_limit = expires + (expires - now)/256;
+ }
mask = expires ^ expires_limit;
-
if (mask == 0)
return expires;
@@ -1284,7 +1303,6 @@ void run_local_timers(void)
{
hrtimer_run_queues();
raise_softirq(TIMER_SOFTIRQ);
- softlockup_tick();
}
/*
@@ -1679,11 +1697,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
+ int err;
+
switch(action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (init_timers_cpu(cpu) < 0)
- return NOTIFY_BAD;
+ err = init_timers_cpu(cpu);
+ if (err < 0)
+ return notifier_from_errno(err);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
@@ -1709,7 +1730,7 @@ void __init init_timers(void)
init_timer_stats();
- BUG_ON(err == NOTIFY_BAD);
+ BUG_ON(err != NOTIFY_OK);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -1742,3 +1763,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
}
EXPORT_SYMBOL(msleep_interruptible);
+
+static int __sched do_usleep_range(unsigned long min, unsigned long max)
+{
+ ktime_t kmin;
+ unsigned long delta;
+
+ kmin = ktime_set(0, min * NSEC_PER_USEC);
+ delta = (max - min) * NSEC_PER_USEC;
+ return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/**
+ * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ */
+void usleep_range(unsigned long min, unsigned long max)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ do_usleep_range(min, max);
+}
+EXPORT_SYMBOL(usleep_range);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..538501c6ea50 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -153,7 +153,7 @@ config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
- depends on GENERIC_TIME
+ depends on !ARCH_USES_GETTIMEOFFSET
select TRACE_IRQFLAGS
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -175,7 +175,7 @@ config IRQSOFF_TRACER
config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
- depends on GENERIC_TIME
+ depends on !ARCH_USES_GETTIMEOFFSET
depends on PREEMPT
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -194,15 +194,6 @@ config PREEMPT_TRACER
enabled. This option and the irqs-off timing option can be
used together or separately.)
-config SYSPROF_TRACER
- bool "Sysprof Tracer"
- depends on X86
- select GENERIC_TRACER
- select CONTEXT_SWITCH_TRACER
- help
- This tracer provides the trace needed by the 'Sysprof' userspace
- tool.
-
config SCHED_TRACER
bool "Scheduling Latency Tracer"
select GENERIC_TRACER
@@ -229,23 +220,6 @@ config FTRACE_SYSCALLS
help
Basic tracer to catch the syscall entry and exit events.
-config BOOT_TRACER
- bool "Trace boot initcalls"
- select GENERIC_TRACER
- select CONTEXT_SWITCH_TRACER
- help
- This tracer helps developers to optimize boot times: it records
- the timings of the initcalls and traces key events and the identity
- of tasks that can cause boot delays, such as context-switches.
-
- Its aim is to be parsed by the scripts/bootgraph.pl tool to
- produce pretty graphics about boot inefficiencies, giving a visual
- representation of the delays during initcalls - but the raw
- /debug/tracing/trace text output is readable too.
-
- You must pass in initcall_debug and ftrace=initcall to the kernel
- command line to enable this on bootup.
-
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -325,28 +299,6 @@ config BRANCH_TRACER
Say N if unsure.
-config KSYM_TRACER
- bool "Trace read and write access on kernel memory locations"
- depends on HAVE_HW_BREAKPOINT
- select TRACING
- help
- This tracer helps find read and write operations on any given kernel
- symbol i.e. /proc/kallsyms.
-
-config PROFILE_KSYM_TRACER
- bool "Profile all kernel memory accesses on 'watched' variables"
- depends on KSYM_TRACER
- help
- This tracer profiles kernel accesses on variables watched through the
- ksym tracer ftrace plugin. Depending upon the hardware, all read
- and write operations on kernel variables can be monitored for
- accesses.
-
- The results will be displayed in:
- /debugfs/tracing/profile_ksym
-
- Say N if unsure.
-
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
@@ -371,37 +323,6 @@ config STACK_TRACER
Say N if unsure.
-config KMEMTRACE
- bool "Trace SLAB allocations"
- select GENERIC_TRACER
- help
- kmemtrace provides tracing for slab allocator functions, such as
- kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
- data is then fed to the userspace application in order to analyse
- allocation hotspots, internal fragmentation and so on, making it
- possible to see how well an allocator performs, as well as debug
- and profile kernel code.
-
- This requires an userspace application to use. See
- Documentation/trace/kmemtrace.txt for more information.
-
- Saying Y will make the kernel somewhat larger and slower. However,
- if you disable kmemtrace at run-time or boot-time, the performance
- impact is minimal (depending on the arch the kernel is built for).
-
- If unsure, say N.
-
-config WORKQUEUE_TRACER
- bool "Trace workqueues"
- select GENERIC_TRACER
- help
- The workqueue tracer provides some statistical information
- about each cpu workqueue thread such as the number of the
- works inserted and executed since their creation. It can help
- to evaluate the amount of work each of them has to perform.
- For example it can help a developer to decide whether he should
- choose a per-cpu workqueue instead of a singlethreaded one.
-
config BLK_DEV_IO_TRACE
bool "Support for tracing block IO actions"
depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
-obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
-obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
ifeq ($(CONFIG_BLOCK),y)
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+ifeq ($(CONFIG_TRACING),y)
+obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
+endif
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f510..638711c17504 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
}
}
-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_abort(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ABORT);
}
-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_INSERT);
}
-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
}
-static void blk_add_trace_rq_requeue(struct request_queue *q,
+static void blk_add_trace_rq_requeue(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(struct request_queue *q,
+static void blk_add_trace_rq_complete(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
}
-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
}
-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_complete(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
}
-static void blk_add_trace_bio_backmerge(struct request_queue *q,
+static void blk_add_trace_bio_backmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
}
-static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+static void blk_add_trace_bio_frontmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
}
-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
}
-static void blk_add_trace_getrq(struct request_queue *q,
+static void blk_add_trace_getrq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
}
-static void blk_add_trace_sleeprq(struct request_queue *q,
+static void blk_add_trace_sleeprq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
}
}
-static void blk_add_trace_plug(struct request_queue *q)
+static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
-static void blk_add_trace_unplug_io(struct request_queue *q)
+static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
}
}
-static void blk_add_trace_unplug_timer(struct request_queue *q)
+static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
}
}
-static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+static void blk_add_trace_split(void *ignore,
+ struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
struct blk_trace *bt = q->blk_trace;
@@ -829,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
/**
* blk_add_trace_remap - Add a trace for a remap operation
+ * @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @bio: the source bio
* @dev: target device
@@ -839,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
* it spans a stripe (or similar). Add a trace for that action.
*
**/
-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_remap(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
@@ -859,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
+ * @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @rq: the source request
* @dev: target device
@@ -869,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
* Add a trace for that action.
*
**/
-static void blk_add_trace_rq_remap(struct request_queue *q,
+static void blk_add_trace_rq_remap(void *ignore,
+ struct request_queue *q,
struct request *rq, dev_t dev,
sector_t from)
{
@@ -921,64 +938,64 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+ ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+ ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+ ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+ ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+ ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+ ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+ ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+ ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+ ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+ ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
WARN_ON(ret);
- ret = register_trace_block_getrq(blk_add_trace_getrq);
+ ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+ ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
WARN_ON(ret);
- ret = register_trace_block_plug(blk_add_trace_plug);
+ ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+ ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+ ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
WARN_ON(ret);
- ret = register_trace_block_split(blk_add_trace_split);
+ ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
- ret = register_trace_block_remap(blk_add_trace_remap);
+ ret = register_trace_block_remap(blk_add_trace_remap, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
+ ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
WARN_ON(ret);
}
static void blk_unregister_tracepoints(void)
{
- unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
- unregister_trace_block_remap(blk_add_trace_remap);
- unregister_trace_block_split(blk_add_trace_split);
- unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
- unregister_trace_block_plug(blk_add_trace_plug);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
- unregister_trace_block_getrq(blk_add_trace_getrq);
- unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
- unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
- unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
- unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
- unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
- unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
- unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
- unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+ unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
+ unregister_trace_block_remap(blk_add_trace_remap, NULL);
+ unregister_trace_block_split(blk_add_trace_split, NULL);
+ unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+ unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
+ unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
+ unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
+ unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
+ unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
+ unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
+ unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
+ unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
+ unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
+ unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
+ unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
+ unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
@@ -1321,7 +1338,7 @@ out:
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return print_one_line(iter, false);
}
@@ -1343,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
}
static enum print_line_t
-blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return blk_trace_synthesize_old_trace(iter) ?
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = {
.set_flag = blk_tracer_set_flag,
};
-static struct trace_event trace_blk_event = {
- .type = TRACE_BLK,
+static struct trace_event_functions trace_blk_event_funcs = {
.trace = blk_trace_event_print,
.binary = blk_trace_event_print_binary,
};
+static struct trace_event trace_blk_event = {
+ .type = TRACE_BLK,
+ .funcs = &trace_blk_event_funcs,
+};
+
static int __init init_blk_tracer(void)
{
if (!register_ftrace_event(&trace_blk_event)) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 32837e19e3bd..0d88ce9b9fb8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
struct hlist_head *hhd;
struct hlist_node *n;
unsigned long key;
- int resched;
key = hash_long(ip, FTRACE_HASH_BITS);
@@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
* period. This syncs the hash iteration and freeing of items
* on the hash. rcu_read_lock is too dangerous here.
*/
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
hlist_for_each_entry_rcu(entry, n, hhd, node) {
if (entry->ip == ip)
entry->ops->func(ip, parent_ip, &entry->data);
}
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_probe_ops __read_mostly =
@@ -3234,7 +3233,8 @@ free:
}
static void
-ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next)
+ftrace_graph_probe_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
unsigned long long timestamp;
int index;
@@ -3288,7 +3288,7 @@ static int start_graph_tracing(void)
} while (ret == -EAGAIN);
if (!ret) {
- ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
if (ret)
pr_info("ftrace_graph: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -3364,7 +3364,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
out:
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index a91da69f153a..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Memory allocator tracing
- *
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- */
-
-#include <linux/tracepoint.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-#include <linux/kmemtrace.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-/* Select an alternative, minimalistic output than the original one */
-#define TRACE_KMEM_OPT_MINIMAL 0x1
-
-static struct tracer_opt kmem_opts[] = {
- /* Default disable the minimalistic output */
- { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
- { }
-};
-
-static struct tracer_flags kmem_tracer_flags = {
- .val = 0,
- .opts = kmem_opts
-};
-
-static struct trace_array *kmemtrace_array;
-
-/* Trace allocations */
-static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- struct ftrace_event_call *call = &event_kmem_alloc;
- struct trace_array *tr = kmemtrace_array;
- struct kmemtrace_alloc_entry *entry;
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
- if (!event)
- return;
-
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
-
- entry->ent.type = TRACE_KMEM_ALLOC;
- entry->type_id = type_id;
- entry->call_site = call_site;
- entry->ptr = ptr;
- entry->bytes_req = bytes_req;
- entry->bytes_alloc = bytes_alloc;
- entry->gfp_flags = gfp_flags;
- entry->node = node;
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-
- trace_wake_up();
-}
-
-static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr)
-{
- struct ftrace_event_call *call = &event_kmem_free;
- struct trace_array *tr = kmemtrace_array;
- struct kmemtrace_free_entry *entry;
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
-
- entry->ent.type = TRACE_KMEM_FREE;
- entry->type_id = type_id;
- entry->call_site = call_site;
- entry->ptr = ptr;
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-
- trace_wake_up();
-}
-
-static void kmemtrace_kmalloc(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmalloc_node(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
-{
- kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
-}
-
-static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
-{
- kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
-}
-
-static int kmemtrace_start_probes(void)
-{
- int err;
-
- err = register_trace_kmalloc(kmemtrace_kmalloc);
- if (err)
- return err;
- err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- if (err)
- return err;
- err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
- if (err)
- return err;
- err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- if (err)
- return err;
- err = register_trace_kfree(kmemtrace_kfree);
- if (err)
- return err;
- err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-
- return err;
-}
-
-static void kmemtrace_stop_probes(void)
-{
- unregister_trace_kmalloc(kmemtrace_kmalloc);
- unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
- unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- unregister_trace_kfree(kmemtrace_kfree);
- unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-}
-
-static int kmem_trace_init(struct trace_array *tr)
-{
- kmemtrace_array = tr;
-
- tracing_reset_online_cpus(tr);
-
- kmemtrace_start_probes();
-
- return 0;
-}
-
-static void kmem_trace_reset(struct trace_array *tr)
-{
- kmemtrace_stop_probes();
-}
-
-static void kmemtrace_headers(struct seq_file *s)
-{
- /* Don't need headers for the original kmemtrace output */
- if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
- return;
-
- seq_printf(s, "#\n");
- seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
- " POINTER NODE CALLER\n");
- seq_printf(s, "# FREE | | | | "
- " | | | |\n");
- seq_printf(s, "# |\n\n");
-}
-
-/*
- * The following functions give the original output from kmemtrace,
- * plus the origin CPU, since reordering occurs in-kernel now.
- */
-
-#define KMEMTRACE_USER_ALLOC 0
-#define KMEMTRACE_USER_FREE 1
-
-struct kmemtrace_user_event {
- u8 event_id;
- u8 type_id;
- u16 event_size;
- u32 cpu;
- u64 timestamp;
- unsigned long call_site;
- unsigned long ptr;
-};
-
-struct kmemtrace_user_event_alloc {
- size_t bytes_req;
- size_t bytes_alloc;
- unsigned gfp_flags;
- int node;
-};
-
-static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_alloc_entry *entry;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
- "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
- entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
- (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
- (unsigned long)entry->gfp_flags, entry->node);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_free_entry *entry;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
- entry->type_id, (void *)entry->call_site,
- (unsigned long)entry->ptr);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_alloc_entry *entry;
- struct kmemtrace_user_event *ev;
- struct kmemtrace_user_event_alloc *ev_alloc;
-
- trace_assign_type(entry, iter->ent);
-
- ev = trace_seq_reserve(s, sizeof(*ev));
- if (!ev)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev->event_id = KMEMTRACE_USER_ALLOC;
- ev->type_id = entry->type_id;
- ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
- ev->cpu = iter->cpu;
- ev->timestamp = iter->ts;
- ev->call_site = entry->call_site;
- ev->ptr = (unsigned long)entry->ptr;
-
- ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
- if (!ev_alloc)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev_alloc->bytes_req = entry->bytes_req;
- ev_alloc->bytes_alloc = entry->bytes_alloc;
- ev_alloc->gfp_flags = entry->gfp_flags;
- ev_alloc->node = entry->node;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_free_entry *entry;
- struct kmemtrace_user_event *ev;
-
- trace_assign_type(entry, iter->ent);
-
- ev = trace_seq_reserve(s, sizeof(*ev));
- if (!ev)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev->event_id = KMEMTRACE_USER_FREE;
- ev->type_id = entry->type_id;
- ev->event_size = sizeof(*ev);
- ev->cpu = iter->cpu;
- ev->timestamp = iter->ts;
- ev->call_site = entry->call_site;
- ev->ptr = (unsigned long)entry->ptr;
-
- return TRACE_TYPE_HANDLED;
-}
-
-/* The two other following provide a more minimalistic output */
-static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter)
-{
- struct kmemtrace_alloc_entry *entry;
- struct trace_seq *s = &iter->seq;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- /* Alloc entry */
- ret = trace_seq_printf(s, " + ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Type */
- switch (entry->type_id) {
- case KMEMTRACE_TYPE_KMALLOC:
- ret = trace_seq_printf(s, "K ");
- break;
- case KMEMTRACE_TYPE_CACHE:
- ret = trace_seq_printf(s, "C ");
- break;
- case KMEMTRACE_TYPE_PAGES:
- ret = trace_seq_printf(s, "P ");
- break;
- default:
- ret = trace_seq_printf(s, "? ");
- }
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Requested */
- ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Allocated */
- ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Flags
- * TODO: would be better to see the name of the GFP flag names
- */
- ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Pointer to allocated */
- ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Node and call site*/
- ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
- (void *)entry->call_site);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter)
-{
- struct kmemtrace_free_entry *entry;
- struct trace_seq *s = &iter->seq;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- /* Free entry */
- ret = trace_seq_printf(s, " - ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Type */
- switch (entry->type_id) {
- case KMEMTRACE_TYPE_KMALLOC:
- ret = trace_seq_printf(s, "K ");
- break;
- case KMEMTRACE_TYPE_CACHE:
- ret = trace_seq_printf(s, "C ");
- break;
- case KMEMTRACE_TYPE_PAGES:
- ret = trace_seq_printf(s, "P ");
- break;
- default:
- ret = trace_seq_printf(s, "? ");
- }
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Skip requested/allocated/flags */
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Pointer to allocated */
- ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Skip node and print call site*/
- ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
-
- if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
- return TRACE_TYPE_UNHANDLED;
-
- switch (entry->type) {
- case TRACE_KMEM_ALLOC:
- return kmemtrace_print_alloc_compress(iter);
- case TRACE_KMEM_FREE:
- return kmemtrace_print_free_compress(iter);
- default:
- return TRACE_TYPE_UNHANDLED;
- }
-}
-
-static struct trace_event kmem_trace_alloc = {
- .type = TRACE_KMEM_ALLOC,
- .trace = kmemtrace_print_alloc,
- .binary = kmemtrace_print_alloc_user,
-};
-
-static struct trace_event kmem_trace_free = {
- .type = TRACE_KMEM_FREE,
- .trace = kmemtrace_print_free,
- .binary = kmemtrace_print_free_user,
-};
-
-static struct tracer kmem_tracer __read_mostly = {
- .name = "kmemtrace",
- .init = kmem_trace_init,
- .reset = kmem_trace_reset,
- .print_line = kmemtrace_print_line,
- .print_header = kmemtrace_headers,
- .flags = &kmem_tracer_flags
-};
-
-void kmemtrace_init(void)
-{
- /* earliest opportunity to start kmem tracing */
-}
-
-static int __init init_kmem_tracer(void)
-{
- if (!register_ftrace_event(&kmem_trace_alloc)) {
- pr_warning("Warning: could not register kmem events\n");
- return 1;
- }
-
- if (!register_ftrace_event(&kmem_trace_free)) {
- pr_warning("Warning: could not register kmem events\n");
- return 1;
- }
-
- if (register_tracer(&kmem_tracer) != 0) {
- pr_warning("Warning: could not register the kmem tracer\n");
- return 1;
- }
-
- return 0;
-}
-device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f6059c5aa94..3632ce87674f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
*/
struct ring_buffer_per_cpu {
int cpu;
+ atomic_t record_disabled;
struct ring_buffer *buffer;
spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu {
unsigned long read;
u64 write_stamp;
u64 read_stamp;
- atomic_t record_disabled;
};
struct ring_buffer {
@@ -1768,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* must fill the old tail_page with padding.
*/
if (tail >= BUF_PAGE_SIZE) {
+ /*
+ * If the page was filled, then we still need
+ * to update the real_end. Reset it to zero
+ * and the reader will ignore it.
+ */
+ if (tail == BUF_PAGE_SIZE)
+ tail_page->real_end = 0;
+
local_sub(length, &tail_page->write);
return;
}
@@ -2234,8 +2242,6 @@ static void trace_recursive_unlock(void)
#endif
-static DEFINE_PER_CPU(int, rb_need_resched);
-
/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
@@ -2256,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
- int cpu, resched;
+ int cpu;
if (ring_buffer_flags != RB_BUFFERS_ON)
return NULL;
/* If we are tracing schedule, we don't want to recurse */
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
if (atomic_read(&buffer->record_disabled))
goto out_nocheck;
@@ -2287,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
if (!event)
goto out;
- /*
- * Need to store resched state on this cpu.
- * Only the first needs to.
- */
-
- if (preempt_count() == 1)
- per_cpu(rb_need_resched, cpu) = resched;
-
return event;
out:
trace_recursive_unlock();
out_nocheck:
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
return NULL;
}
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2347,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
trace_recursive_unlock();
- /*
- * Only the last preempt count needs to restore preemption.
- */
- if (preempt_count() == 1)
- ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
- else
- preempt_enable_no_resched_notrace();
+ preempt_enable_notrace();
return 0;
}
@@ -2461,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
trace_recursive_unlock();
- /*
- * Only the last preempt count needs to restore preemption.
- */
- if (preempt_count() == 1)
- ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
- else
- preempt_enable_no_resched_notrace();
+ preempt_enable_notrace();
}
EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2493,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
struct ring_buffer_event *event;
void *body;
int ret = -EBUSY;
- int cpu, resched;
+ int cpu;
if (ring_buffer_flags != RB_BUFFERS_ON)
return -EBUSY;
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
if (atomic_read(&buffer->record_disabled))
goto out;
@@ -2528,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
ret = 0;
out:
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
return ret;
}
@@ -3894,12 +3880,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
ret = read;
cpu_buffer->lost_events = 0;
+
+ commit = local_read(&bpage->commit);
/*
* Set a flag in the commit field if we lost events
*/
if (missed_events) {
- commit = local_read(&bpage->commit);
-
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
@@ -3907,10 +3893,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
+ commit += sizeof(missed_events);
}
local_add(RB_MISSED_EVENTS, &bpage->commit);
}
+ /*
+ * This page may be off to user land. Zero it out here.
+ */
+ if (commit < BUF_PAGE_SIZE)
+ memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+
out_unlock:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 756d7283318b..ba14a22be4cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
preempt_enable();
}
-static cpumask_var_t __read_mostly tracing_buffer_mask;
-
-#define for_each_tracing_cpu(cpu) \
- for_each_cpu(cpu, tracing_buffer_mask)
+cpumask_var_t __read_mostly tracing_buffer_mask;
/*
* ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- TRACE_ITER_GRAPH_TIME;
+ TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
static int trace_stop_count;
static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +425,7 @@ static const char *trace_options[] = {
"latency-format",
"sleep-time",
"graph-time",
+ "record-cmd",
NULL
};
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
WARN_ON_ONCE(!irqs_disabled());
+ if (!current_trace->use_max_tr) {
+ WARN_ON_ONCE(1);
+ return;
+ }
arch_spin_lock(&ftrace_max_lock);
tr->buffer = max_tr.buffer;
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
WARN_ON_ONCE(!irqs_disabled());
+ if (!current_trace->use_max_tr) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
arch_spin_lock(&ftrace_max_lock);
ftrace_disable_cpu();
@@ -729,18 +736,11 @@ __acquires(kernel_lock)
return -1;
}
- if (strlen(type->name) > MAX_TRACER_SIZE) {
+ if (strlen(type->name) >= MAX_TRACER_SIZE) {
pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
return -1;
}
- /*
- * When this gets called we hold the BKL which means that
- * preemption is disabled. Various trace selftests however
- * need to disable and enable preemption for successful tests.
- * So we drop the BKL here and grab it after the tests again.
- */
- unlock_kernel();
mutex_lock(&trace_types_lock);
tracing_selftest_running = true;
@@ -822,7 +822,6 @@ __acquires(kernel_lock)
#endif
out_unlock:
- lock_kernel();
return ret;
}
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
#endif /* CONFIG_STACKTRACE */
-static void
-ftrace_trace_special(void *__tr,
- unsigned long arg1, unsigned long arg2, unsigned long arg3,
- int pc)
-{
- struct ftrace_event_call *call = &event_special;
- struct ring_buffer_event *event;
- struct trace_array *tr = __tr;
- struct ring_buffer *buffer = tr->buffer;
- struct special_entry *entry;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
- sizeof(*entry), 0, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->arg1 = arg1;
- entry->arg2 = arg2;
- entry->arg3 = arg3;
-
- if (!filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, pc);
-}
-
-void
-__trace_special(void *__tr, void *__data,
- unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
- ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
-}
-
-void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu;
- int pc;
-
- if (tracing_disabled)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
-
- if (likely(atomic_inc_return(&data->disabled) == 1))
- ftrace_trace_special(tr, arg1, arg2, arg3, pc);
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
-
/**
* trace_vbprintk - write binary msg to tracing buffer
*
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct bprint_entry *entry;
unsigned long flags;
int disable;
- int resched;
int cpu, len = 0, size, pc;
if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
pause_graph_tracing();
pc = preempt_count();
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = tr->data[cpu];
@@ -1452,7 +1395,7 @@ out_unlock:
out:
atomic_dec_return(&data->disabled);
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
unpause_graph_tracing();
return len;
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(trace_vprintk);
-enum trace_file_type {
- TRACE_FILE_LAT_FMT = 1,
- TRACE_FILE_ANNOTATE = 2,
-};
-
static void trace_iterator_increment(struct trace_iterator *iter)
{
/* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
}
/* Find the next real entry, and increment the iterator to the next entry */
-static void *find_next_entry_inc(struct trace_iterator *iter)
+void *trace_find_next_entry_inc(struct trace_iterator *iter)
{
iter->ent = __find_next_entry(iter, &iter->cpu,
&iter->lost_events, &iter->ts);
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
return NULL;
if (iter->idx < 0)
- ent = find_next_entry_inc(iter);
+ ent = trace_find_next_entry_inc(iter);
else
ent = iter;
while (ent && iter->idx < i)
- ent = find_next_entry_inc(iter);
+ ent = trace_find_next_entry_inc(iter);
iter->pos = *pos;
return ent;
}
-static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
+void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
struct trace_array *tr = iter->tr;
struct ring_buffer_event *event;
@@ -1936,7 +1874,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
}
if (event)
- return event->trace(iter, sym_flags);
+ return event->funcs->trace(iter, sym_flags, event);
if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
goto partial;
@@ -1962,7 +1900,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event)
- return event->raw(iter, 0);
+ return event->funcs->raw(iter, 0, event);
if (!trace_seq_printf(s, "%d ?\n", entry->type))
goto partial;
@@ -1989,7 +1927,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event) {
- enum print_line_t ret = event->hex(iter, 0);
+ enum print_line_t ret = event->funcs->hex(iter, 0, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -2014,7 +1952,8 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
}
event = ftrace_find_event(entry->type);
- return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
+ return event ? event->funcs->binary(iter, 0, event) :
+ TRACE_TYPE_HANDLED;
}
int trace_empty(struct trace_iterator *iter)
@@ -2048,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter)
}
/* Called with trace_event_read_lock() held. */
-static enum print_line_t print_trace_line(struct trace_iterator *iter)
+enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
@@ -2393,6 +2332,7 @@ static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.release = seq_release,
+ .llseek = seq_lseek,
};
/*
@@ -2486,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = {
.open = tracing_open_generic,
.read = tracing_cpumask_read,
.write = tracing_cpumask_write,
+ .llseek = generic_file_llseek,
};
static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2561,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
trace_flags |= mask;
else
trace_flags &= ~mask;
+
+ if (mask == TRACE_ITER_RECORD_CMD)
+ trace_event_enable_cmd_record(enabled);
}
static ssize_t
@@ -2652,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
static const struct file_operations tracing_readme_fops = {
.open = tracing_open_generic,
.read = tracing_readme_read,
+ .llseek = generic_file_llseek,
};
static ssize_t
@@ -2702,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
static const struct file_operations tracing_saved_cmdlines_fops = {
.open = tracing_open_generic,
.read = tracing_saved_cmdlines_read,
+ .llseek = generic_file_llseek,
};
static ssize_t
@@ -2797,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
if (ret < 0)
return ret;
+ if (!current_trace->use_max_tr)
+ goto out;
+
ret = ring_buffer_resize(max_tr.buffer, size);
if (ret < 0) {
int r;
@@ -2824,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
return ret;
}
+ max_tr.entries = size;
+ out:
global_trace.entries = size;
return ret;
}
+
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
*
@@ -2889,12 +2841,26 @@ static int tracing_set_tracer(const char *buf)
trace_branch_disable();
if (current_trace && current_trace->reset)
current_trace->reset(tr);
-
+ if (current_trace && current_trace->use_max_tr) {
+ /*
+ * We don't free the ring buffer. instead, resize it because
+ * The max_tr ring buffer has some state (e.g. ring->clock) and
+ * we want preserve it.
+ */
+ ring_buffer_resize(max_tr.buffer, 1);
+ max_tr.entries = 1;
+ }
destroy_trace_option_files(topts);
current_trace = t;
topts = create_trace_option_files(current_trace);
+ if (current_trace->use_max_tr) {
+ ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
+ if (ret < 0)
+ goto out;
+ max_tr.entries = global_trace.entries;
+ }
if (t->init) {
ret = tracer_init(t, tr);
@@ -3031,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
if (iter->trace->pipe_open)
iter->trace->pipe_open(iter);
+ nonseekable_open(inode, filp);
out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -3210,7 +3177,7 @@ waitagain:
trace_event_read_lock();
trace_access_lock(iter->cpu_file);
- while (find_next_entry_inc(iter) != NULL) {
+ while (trace_find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
int len = iter->seq.len;
@@ -3293,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(iter);
rem -= count;
- if (!find_next_entry_inc(iter)) {
+ if (!trace_find_next_entry_inc(iter)) {
rem = 0;
iter->ent = NULL;
break;
@@ -3309,12 +3276,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
size_t len,
unsigned int flags)
{
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages_def[PIPE_DEF_BUFFERS];
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
struct trace_iterator *iter = filp->private_data;
struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
+ .pages = pages_def,
+ .partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.flags = flags,
.ops = &tracing_pipe_buf_ops,
@@ -3325,6 +3292,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
size_t rem;
unsigned int i;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3346,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
if (ret <= 0)
goto out_err;
- if (!iter->ent && !find_next_entry_inc(iter)) {
+ if (!iter->ent && !trace_find_next_entry_inc(iter)) {
ret = -EFAULT;
goto out_err;
}
@@ -3355,23 +3325,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
trace_access_lock(iter->cpu_file);
/* Fill as many pages as possible. */
- for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
- pages[i] = alloc_page(GFP_KERNEL);
- if (!pages[i])
+ for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+ spd.pages[i] = alloc_page(GFP_KERNEL);
+ if (!spd.pages[i])
break;
rem = tracing_fill_pipe_page(rem, iter);
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
- page_address(pages[i]),
+ page_address(spd.pages[i]),
iter->seq.len);
if (ret < 0) {
- __free_page(pages[i]);
+ __free_page(spd.pages[i]);
break;
}
- partial[i].offset = 0;
- partial[i].len = iter->seq.len;
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = iter->seq.len;
trace_seq_init(&iter->seq);
}
@@ -3382,12 +3352,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
spd.nr_pages = i;
- return splice_to_pipe(pipe, &spd);
+ ret = splice_to_pipe(pipe, &spd);
+out:
+ splice_shrink_spd(pipe, &spd);
+ return ret;
out_err:
mutex_unlock(&iter->mutex);
-
- return ret;
+ goto out;
}
static ssize_t
@@ -3471,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
}
tracing_start();
- max_tr.entries = global_trace.entries;
mutex_unlock(&trace_types_lock);
return cnt;
@@ -3584,18 +3555,21 @@ static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
.write = tracing_max_lat_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations tracing_ctrl_fops = {
.open = tracing_open_generic,
.read = tracing_ctrl_read,
.write = tracing_ctrl_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations tracing_pipe_fops = {
@@ -3604,17 +3578,20 @@ static const struct file_operations tracing_pipe_fops = {
.read = tracing_read_pipe,
.splice_read = tracing_splice_read_pipe,
.release = tracing_release_pipe,
+ .llseek = no_llseek,
};
static const struct file_operations tracing_entries_fops = {
.open = tracing_open_generic,
.read = tracing_entries_read,
.write = tracing_entries_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations tracing_mark_fops = {
.open = tracing_open_generic,
.write = tracing_mark_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations trace_clock_fops = {
@@ -3660,7 +3637,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
- unsigned int pos;
ssize_t ret;
size_t size;
@@ -3687,11 +3663,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (ret < 0)
return 0;
- pos = ring_buffer_page_len(info->spare);
-
- if (pos < PAGE_SIZE)
- memset(info->spare + pos, 0, PAGE_SIZE - pos);
-
read:
size = PAGE_SIZE - info->read;
if (size > count)
@@ -3786,11 +3757,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
- struct partial_page partial[PIPE_BUFFERS];
- struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
+ struct page *pages_def[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
+ .pages = pages_def,
+ .partial = partial_def,
.flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
@@ -3799,22 +3770,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int entries, size, i;
size_t ret;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
if (*ppos & (PAGE_SIZE - 1)) {
WARN_ONCE(1, "Ftrace: previous read must page-align\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
if (len & (PAGE_SIZE - 1)) {
WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
- if (len < PAGE_SIZE)
- return -EINVAL;
+ if (len < PAGE_SIZE) {
+ ret = -EINVAL;
+ goto out;
+ }
len &= PAGE_MASK;
}
trace_access_lock(info->cpu);
entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
- for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
@@ -3869,11 +3846,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
else
ret = 0;
/* TODO: block */
- return ret;
+ goto out;
}
ret = splice_to_pipe(pipe, &spd);
-
+ splice_shrink_spd(pipe, &spd);
+out:
return ret;
}
@@ -3919,6 +3897,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
static const struct file_operations tracing_stats_fops = {
.open = tracing_open_generic,
.read = tracing_stats_read,
+ .llseek = generic_file_llseek,
};
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3955,6 +3934,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
static const struct file_operations tracing_dyn_info_fops = {
.open = tracing_open_generic,
.read = tracing_read_dyn_info,
+ .llseek = generic_file_llseek,
};
#endif
@@ -4108,6 +4088,7 @@ static const struct file_operations trace_options_fops = {
.open = tracing_open_generic,
.read = trace_options_read,
.write = trace_options_write,
+ .llseek = generic_file_llseek,
};
static ssize_t
@@ -4159,6 +4140,7 @@ static const struct file_operations trace_options_core_fops = {
.open = tracing_open_generic,
.read = trace_options_core_read,
.write = trace_options_core_write,
+ .llseek = generic_file_llseek,
};
struct dentry *trace_create_file(const char *name,
@@ -4348,9 +4330,6 @@ static __init int tracer_init_debugfs(void)
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
-#ifdef CONFIG_SYSPROF_TRACER
- init_tracer_sysprof_debugfs(d_tracer);
-#endif
create_trace_options_dir();
@@ -4407,7 +4386,7 @@ static struct notifier_block trace_die_notifier = {
*/
#define KERN_TRACE KERN_EMERG
-static void
+void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
@@ -4422,6 +4401,13 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_init(s);
}
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+ iter->tr = &global_trace;
+ iter->trace = current_trace;
+ iter->cpu_file = TRACE_PIPE_ALL_CPU;
+}
+
static void
__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
{
@@ -4447,8 +4433,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
if (disable_tracing)
ftrace_kill();
+ trace_init_global_iter(&iter);
+
for_each_tracing_cpu(cpu) {
- atomic_inc(&global_trace.data[cpu]->disabled);
+ atomic_inc(&iter.tr->data[cpu]->disabled);
}
old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4497,7 +4485,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
iter.iter_flags |= TRACE_FILE_LAT_FMT;
iter.pos = -1;
- if (find_next_entry_inc(&iter) != NULL) {
+ if (trace_find_next_entry_inc(&iter) != NULL) {
int ret;
ret = print_trace_line(&iter);
@@ -4519,7 +4507,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&global_trace.data[cpu]->disabled);
+ atomic_dec(&iter.tr->data[cpu]->disabled);
}
tracing_on();
}
@@ -4568,16 +4556,14 @@ __init static int tracer_alloc_buffers(void)
#ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(ring_buf_size,
- TRACE_BUFFER_FLAGS);
+ max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
if (!max_tr.buffer) {
printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
WARN_ON(1);
ring_buffer_free(global_trace.buffer);
goto out_free_cpumask;
}
- max_tr.entries = ring_buffer_size(max_tr.buffer);
- WARN_ON(max_tr.entries != global_trace.entries);
+ max_tr.entries = 1;
#endif
/* Allocate the first page for all buffers */
@@ -4590,9 +4576,6 @@ __init static int tracer_alloc_buffers(void)
register_tracer(&nop_trace);
current_trace = &nop_trace;
-#ifdef CONFIG_BOOT_TRACER
- register_tracer(&boot_tracer);
-#endif
/* All seems OK, enable tracing */
tracing_disabled = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d1ce0bec1b3f..d39b3c5454a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
#include <linux/mmiotrace.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
-#include <trace/boot.h>
-#include <linux/kmemtrace.h>
#include <linux/hw_breakpoint.h>
-
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
@@ -25,30 +22,17 @@ enum trace_type {
TRACE_STACK,
TRACE_PRINT,
TRACE_BPRINT,
- TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
TRACE_BRANCH,
- TRACE_BOOT_CALL,
- TRACE_BOOT_RET,
TRACE_GRAPH_RET,
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
- TRACE_KMEM_ALLOC,
- TRACE_KMEM_FREE,
TRACE_BLK,
- TRACE_KSYM,
__TRACE_LAST_TYPE,
};
-enum kmemtrace_type_id {
- KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
- KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
- KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
-};
-
-extern struct tracer boot_tracer;
#undef __field
#define __field(type, item) type item;
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
- IF_ASSIGN(var, ent, struct special_entry, 0); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
TRACE_MMIO_MAP); \
- IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
- IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
TRACE_GRAPH_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
- IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
- TRACE_KMEM_ALLOC); \
- IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
- TRACE_KMEM_FREE); \
- IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
__ftrace_bad_type(); \
} while (0)
@@ -298,6 +274,7 @@ struct tracer {
struct tracer *next;
int print_max;
struct tracer_flags *flags;
+ int use_max_tr;
};
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
const struct file_operations *fops);
struct dentry *tracing_init_dentry(void);
-void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
struct ring_buffer_event;
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
+int trace_empty(struct trace_iterator *iter);
+
+void *trace_find_next_entry_inc(struct trace_iterator *iter);
+
+void trace_init_global_iter(struct trace_iterator *iter);
+
+void tracing_iter_reset(struct trace_iterator *iter, int cpu);
+
void default_wait_pipe(struct trace_iterator *iter);
void poll_wait_pipe(struct trace_iterator *iter);
@@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *wakee,
struct task_struct *cur,
unsigned long flags, int pc);
-void trace_special(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3, int pc);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
void unregister_tracer(struct tracer *type);
int is_tracing_stopped(void);
+enum trace_file_type {
+ TRACE_FILE_LAT_FMT = 1,
+ TRACE_FILE_ANNOTATE = 2,
+};
+
+extern cpumask_var_t __read_mostly tracing_buffer_mask;
-extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
+#define for_each_tracing_cpu(cpu) \
+ for_each_cpu(cpu, tracing_buffer_mask)
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
@@ -405,12 +391,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_stack(struct trace_array *tr,
+static inline void ftrace_trace_stack(struct ring_buffer *buffer,
unsigned long flags, int skip, int pc)
{
}
-static inline void ftrace_trace_userstack(struct trace_array *tr,
+static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
unsigned long flags, int pc)
{
}
@@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_sched_switch(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_sysprof(struct tracer *trace,
- struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_ksym(struct tracer *trace,
- struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
@@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...);
+void trace_printk_seq(struct trace_seq *s);
+enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
@@ -617,6 +601,7 @@ enum trace_iterator_flags {
TRACE_ITER_LATENCY_FMT = 0x20000,
TRACE_ITER_SLEEP_TIME = 0x40000,
TRACE_ITER_GRAPH_TIME = 0x80000,
+ TRACE_ITER_RECORD_CMD = 0x100000,
};
/*
@@ -628,54 +613,6 @@ enum trace_iterator_flags {
extern struct tracer nop_trace;
-/**
- * ftrace_preempt_disable - disable preemption scheduler safe
- *
- * When tracing can happen inside the scheduler, there exists
- * cases that the tracing might happen before the need_resched
- * flag is checked. If this happens and the tracer calls
- * preempt_enable (after a disable), a schedule might take place
- * causing an infinite recursion.
- *
- * To prevent this, we read the need_resched flag before
- * disabling preemption. When we want to enable preemption we
- * check the flag, if it is set, then we call preempt_enable_no_resched.
- * Otherwise, we call preempt_enable.
- *
- * The rational for doing the above is that if need_resched is set
- * and we have yet to reschedule, we are either in an atomic location
- * (where we do not need to check for scheduling) or we are inside
- * the scheduler and do not want to resched.
- */
-static inline int ftrace_preempt_disable(void)
-{
- int resched;
-
- resched = need_resched();
- preempt_disable_notrace();
-
- return resched;
-}
-
-/**
- * ftrace_preempt_enable - enable preemption scheduler safe
- * @resched: the return value from ftrace_preempt_disable
- *
- * This is a scheduler safe way to enable preemption and not miss
- * any preemption checks. The disabled saved the state of preemption.
- * If resched is set, then we are either inside an atomic or
- * are inside the scheduler (we would have already scheduled
- * otherwise). In this case, we do not want to call normal
- * preempt_enable, but preempt_enable_no_resched instead.
- */
-static inline void ftrace_preempt_enable(int resched)
-{
- if (resched)
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
-}
-
#ifdef CONFIG_BRANCH_TRACER
extern int enable_branch_tracing(struct trace_array *tr);
extern void disable_branch_tracing(void);
@@ -766,6 +703,8 @@ struct filter_pred {
int pop_n;
};
+extern struct list_head ftrace_common_fields;
+
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
extern void print_event_filter(struct ftrace_event_call *call,
@@ -778,12 +717,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call);
+
static inline int
filter_check_discard(struct ftrace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(call->filter_active) &&
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
!filter_match_preds(call->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
@@ -792,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
return 0;
}
+extern void trace_event_enable_cmd_record(bool enable);
+
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * ring buffer based initcalls tracer
- *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/kallsyms.h>
-#include <linux/time.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-static struct trace_array *boot_trace;
-static bool pre_initcalls_finished;
-
-/* Tells the boot tracer that the pre_smp_initcalls are finished.
- * So we are ready .
- * It doesn't enable sched events tracing however.
- * You have to call enable_boot_trace to do so.
- */
-void start_boot_trace(void)
-{
- pre_initcalls_finished = true;
-}
-
-void enable_boot_trace(void)
-{
- if (boot_trace && pre_initcalls_finished)
- tracing_start_sched_switch_record();
-}
-
-void disable_boot_trace(void)
-{
- if (boot_trace && pre_initcalls_finished)
- tracing_stop_sched_switch_record();
-}
-
-static int boot_trace_init(struct trace_array *tr)
-{
- boot_trace = tr;
-
- if (!tr)
- return 0;
-
- tracing_reset_online_cpus(tr);
-
- tracing_sched_switch_assign_trace(tr);
- return 0;
-}
-
-static enum print_line_t
-initcall_call_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
- struct trace_seq *s = &iter->seq;
- struct trace_boot_call *field;
- struct boot_trace_call *call;
- u64 ts;
- unsigned long nsec_rem;
- int ret;
-
- trace_assign_type(field, entry);
- call = &field->boot_call;
- ts = iter->ts;
- nsec_rem = do_div(ts, NSEC_PER_SEC);
-
- ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
- (unsigned long)ts, nsec_rem, call->func, call->caller);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- else
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-initcall_ret_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
- struct trace_seq *s = &iter->seq;
- struct trace_boot_ret *field;
- struct boot_trace_ret *init_ret;
- u64 ts;
- unsigned long nsec_rem;
- int ret;
-
- trace_assign_type(field, entry);
- init_ret = &field->boot_ret;
- ts = iter->ts;
- nsec_rem = do_div(ts, NSEC_PER_SEC);
-
- ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
- "returned %d after %llu msecs\n",
- (unsigned long) ts,
- nsec_rem,
- init_ret->func, init_ret->result, init_ret->duration);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- else
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t initcall_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
-
- switch (entry->type) {
- case TRACE_BOOT_CALL:
- return initcall_call_print_line(iter);
- case TRACE_BOOT_RET:
- return initcall_ret_print_line(iter);
- default:
- return TRACE_TYPE_UNHANDLED;
- }
-}
-
-struct tracer boot_tracer __read_mostly =
-{
- .name = "initcall",
- .init = boot_trace_init,
- .reset = tracing_reset_online_cpus,
- .print_line = initcall_print_line,
-};
-
-void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
-{
- struct ftrace_event_call *call = &event_boot_call;
- struct ring_buffer_event *event;
- struct ring_buffer *buffer;
- struct trace_boot_call *entry;
- struct trace_array *tr = boot_trace;
-
- if (!tr || !pre_initcalls_finished)
- return;
-
- /* Get its name now since this function could
- * disappear because it is in the .init section.
- */
- sprint_symbol(bt->func, (unsigned long)fn);
- preempt_disable();
-
- buffer = tr->buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->boot_call = *bt;
- if (!filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
- preempt_enable();
-}
-
-void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
-{
- struct ftrace_event_call *call = &event_boot_ret;
- struct ring_buffer_event *event;
- struct ring_buffer *buffer;
- struct trace_boot_ret *entry;
- struct trace_array *tr = boot_trace;
-
- if (!tr || !pre_initcalls_finished)
- return;
-
- sprint_symbol(bt->func, (unsigned long)fn);
- preempt_disable();
-
- buffer = tr->buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->boot_ret = *bt;
- if (!filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
- preempt_enable();
-}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
}
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct trace_branch *field;
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
" |\n");
}
+static struct trace_event_functions trace_branch_funcs = {
+ .trace = trace_branch_print,
+};
+
static struct trace_event trace_branch_event = {
.type = TRACE_BRANCH,
- .trace = trace_branch_print,
+ .funcs = &trace_branch_funcs,
};
static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
u64 notrace trace_clock_local(void)
{
u64 clock;
- int resched;
/*
* sched_clock() is an architecture implemented, fast, scalable,
* lockless clock. It is not guaranteed to be coherent across
* CPUs, nor across CPU idle events.
*/
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
clock = sched_clock();
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
return clock;
}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
*/
u64 notrace trace_clock(void)
{
- return cpu_clock(raw_smp_processor_id());
+ return local_clock();
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
);
/*
- * Special (free-form) trace entry:
- */
-FTRACE_ENTRY(special, special_entry,
-
- TRACE_SPECIAL,
-
- F_STRUCT(
- __field( unsigned long, arg1 )
- __field( unsigned long, arg2 )
- __field( unsigned long, arg3 )
- ),
-
- F_printk("(%08lx) (%08lx) (%08lx)",
- __entry->arg1, __entry->arg2, __entry->arg3)
-);
-
-/*
* Stack-trace entry:
*/
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
__entry->map_id, __entry->opcode)
);
-FTRACE_ENTRY(boot_call, trace_boot_call,
-
- TRACE_BOOT_CALL,
-
- F_STRUCT(
- __field_struct( struct boot_trace_call, boot_call )
- __field_desc( pid_t, boot_call, caller )
- __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
- ),
-
- F_printk("%d %s", __entry->caller, __entry->func)
-);
-
-FTRACE_ENTRY(boot_ret, trace_boot_ret,
-
- TRACE_BOOT_RET,
-
- F_STRUCT(
- __field_struct( struct boot_trace_ret, boot_ret )
- __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
- __field_desc( int, boot_ret, result )
- __field_desc( unsigned long, boot_ret, duration )
- ),
-
- F_printk("%s %d %lx",
- __entry->func, __entry->result, __entry->duration)
-);
#define TRACE_FUNC_SIZE 30
#define TRACE_FILE_SIZE 20
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
__entry->func, __entry->file, __entry->correct)
);
-FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
-
- TRACE_KMEM_ALLOC,
-
- F_STRUCT(
- __field( enum kmemtrace_type_id, type_id )
- __field( unsigned long, call_site )
- __field( const void *, ptr )
- __field( size_t, bytes_req )
- __field( size_t, bytes_alloc )
- __field( gfp_t, gfp_flags )
- __field( int, node )
- ),
-
- F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
- " flags:%x node:%d",
- __entry->type_id, __entry->call_site, __entry->ptr,
- __entry->bytes_req, __entry->bytes_alloc,
- __entry->gfp_flags, __entry->node)
-);
-
-FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
-
- TRACE_KMEM_FREE,
-
- F_STRUCT(
- __field( enum kmemtrace_type_id, type_id )
- __field( unsigned long, call_site )
- __field( const void *, ptr )
- ),
-
- F_printk("type:%u call_site:%lx ptr:%p",
- __entry->type_id, __entry->call_site, __entry->ptr)
-);
-
-FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
-
- TRACE_KSYM,
-
- F_STRUCT(
- __field( unsigned long, ip )
- __field( unsigned char, type )
- __array( char , cmd, TASK_COMM_LEN )
- __field( unsigned long, addr )
- ),
-
- F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
- (void *)__entry->ip, (unsigned int)__entry->type,
- (void *)__entry->addr, __entry->cmd)
-);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..000e6e85b445 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,13 +9,7 @@
#include <linux/kprobes.h>
#include "trace.h"
-DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
-EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
-
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
-
-static char *perf_trace_buf;
-static char *perf_trace_buf_nmi;
+static char *perf_trace_buf[4];
/*
* Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -27,57 +21,76 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
-static int perf_trace_event_enable(struct ftrace_event_call *event)
+static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
{
- char *buf;
+ struct hlist_head *list;
int ret = -ENOMEM;
+ int cpu;
- if (event->perf_refcount++ > 0)
+ p_event->tp_event = tp_event;
+ if (tp_event->perf_refcount++ > 0)
return 0;
- if (!total_ref_count) {
- buf = (char *)alloc_percpu(perf_trace_t);
- if (!buf)
- goto fail_buf;
+ list = alloc_percpu(struct hlist_head);
+ if (!list)
+ goto fail;
- rcu_assign_pointer(perf_trace_buf, buf);
+ for_each_possible_cpu(cpu)
+ INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
- buf = (char *)alloc_percpu(perf_trace_t);
- if (!buf)
- goto fail_buf_nmi;
+ tp_event->perf_events = list;
- rcu_assign_pointer(perf_trace_buf_nmi, buf);
- }
+ if (!total_ref_count) {
+ char *buf;
+ int i;
- ret = event->perf_event_enable(event);
- if (!ret) {
- total_ref_count++;
- return 0;
+ for (i = 0; i < 4; i++) {
+ buf = (char *)alloc_percpu(perf_trace_t);
+ if (!buf)
+ goto fail;
+
+ perf_trace_buf[i] = buf;
+ }
}
-fail_buf_nmi:
+ ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
+ if (ret)
+ goto fail;
+
+ total_ref_count++;
+ return 0;
+
+fail:
if (!total_ref_count) {
- free_percpu(perf_trace_buf_nmi);
- free_percpu(perf_trace_buf);
- perf_trace_buf_nmi = NULL;
- perf_trace_buf = NULL;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
+ }
+ }
+
+ if (!--tp_event->perf_refcount) {
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
}
-fail_buf:
- event->perf_refcount--;
return ret;
}
-int perf_trace_enable(int event_id)
+int perf_trace_init(struct perf_event *p_event)
{
- struct ftrace_event_call *event;
+ struct ftrace_event_call *tp_event;
+ int event_id = p_event->attr.config;
int ret = -EINVAL;
mutex_lock(&event_mutex);
- list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id && event->perf_event_enable &&
- try_module_get(event->mod)) {
- ret = perf_trace_event_enable(event);
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ if (tp_event->event.type == event_id &&
+ tp_event->class && tp_event->class->reg &&
+ try_module_get(tp_event->mod)) {
+ ret = perf_trace_event_init(tp_event, p_event);
break;
}
}
@@ -86,90 +99,82 @@ int perf_trace_enable(int event_id)
return ret;
}
-static void perf_trace_event_disable(struct ftrace_event_call *event)
+int perf_trace_enable(struct perf_event *p_event)
{
- char *buf, *nmi_buf;
-
- if (--event->perf_refcount > 0)
- return;
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct hlist_head *list;
- event->perf_event_disable(event);
+ list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!list))
+ return -EINVAL;
- if (!--total_ref_count) {
- buf = perf_trace_buf;
- rcu_assign_pointer(perf_trace_buf, NULL);
-
- nmi_buf = perf_trace_buf_nmi;
- rcu_assign_pointer(perf_trace_buf_nmi, NULL);
+ list = this_cpu_ptr(list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
- /*
- * Ensure every events in profiling have finished before
- * releasing the buffers
- */
- synchronize_sched();
+ return 0;
+}
- free_percpu(buf);
- free_percpu(nmi_buf);
- }
+void perf_trace_disable(struct perf_event *p_event)
+{
+ hlist_del_rcu(&p_event->hlist_entry);
}
-void perf_trace_disable(int event_id)
+void perf_trace_destroy(struct perf_event *p_event)
{
- struct ftrace_event_call *event;
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ int i;
mutex_lock(&event_mutex);
- list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id) {
- perf_trace_event_disable(event);
- module_put(event->mod);
- break;
+ if (--tp_event->perf_refcount > 0)
+ goto out;
+
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
+
+ /*
+ * Ensure our callback won't be called anymore. The buffers
+ * will be freed after that.
+ */
+ tracepoint_synchronize_unregister();
+
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
+
+ if (!--total_ref_count) {
+ for (i = 0; i < 4; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
}
}
+out:
mutex_unlock(&event_mutex);
}
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
- int *rctxp, unsigned long *irq_flags)
+ struct pt_regs *regs, int *rctxp)
{
struct trace_entry *entry;
- char *trace_buf, *raw_data;
- int pc, cpu;
+ unsigned long flags;
+ char *raw_data;
+ int pc;
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
pc = preempt_count();
- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(*irq_flags);
-
*rctxp = perf_swevent_get_recursion_context();
if (*rctxp < 0)
- goto err_recursion;
-
- cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference_sched(perf_trace_buf);
-
- if (!trace_buf)
- goto err;
+ return NULL;
- raw_data = per_cpu_ptr(trace_buf, cpu);
+ raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
/* zero the dead bytes from align to not leak stack to user */
memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
entry = (struct trace_entry *)raw_data;
- tracing_generic_entry_update(entry, *irq_flags, pc);
+ local_save_flags(flags);
+ tracing_generic_entry_update(entry, flags, pc);
entry->type = type;
return raw_data;
-err:
- perf_swevent_put_recursion_context(*rctxp);
-err_recursion:
- local_irq_restore(*irq_flags);
- return NULL;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c7043349..09b4fa6e4d3b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,10 +28,19 @@
DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
+LIST_HEAD(ftrace_common_fields);
-int trace_define_field(struct ftrace_event_call *call, const char *type,
- const char *name, int offset, int size, int is_signed,
- int filter_type)
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call)
+{
+ if (!event_call->class->get_fields)
+ return &event_call->class->fields;
+ return event_call->class->get_fields(event_call);
+}
+
+static int __trace_define_field(struct list_head *head, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
{
struct ftrace_event_field *field;
@@ -56,7 +65,7 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
field->size = size;
field->is_signed = is_signed;
- list_add(&field->link, &call->fields);
+ list_add(&field->link, head);
return 0;
@@ -67,17 +76,32 @@ err:
return -ENOMEM;
}
+
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type);
+}
EXPORT_SYMBOL_GPL(trace_define_field);
#define __common_field(type, item) \
- ret = trace_define_field(call, #type, "common_" #item, \
- offsetof(typeof(ent), item), \
- sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER); \
+ ret = __trace_define_field(&ftrace_common_fields, #type, \
+ "common_" #item, \
+ offsetof(typeof(ent), item), \
+ sizeof(ent.item), \
+ is_signed_type(type), FILTER_OTHER); \
if (ret) \
return ret;
-static int trace_define_common_fields(struct ftrace_event_call *call)
+static int trace_define_common_fields(void)
{
int ret;
struct trace_entry ent;
@@ -94,8 +118,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
+ struct list_head *head;
- list_for_each_entry_safe(field, next, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry_safe(field, next, head, link) {
list_del(&field->link);
kfree(field->type);
kfree(field->name);
@@ -107,16 +133,63 @@ int trace_event_raw_init(struct ftrace_event_call *call)
{
int id;
- id = register_ftrace_event(call->event);
+ id = register_ftrace_event(&call->event);
if (!id)
return -ENODEV;
- call->id = id;
- INIT_LIST_HEAD(&call->fields);
return 0;
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
+int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return tracepoint_probe_register(call->name,
+ call->class->probe,
+ call);
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->name,
+ call->class->probe,
+ call);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return tracepoint_probe_register(call->name,
+ call->class->perf_probe,
+ call);
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->name,
+ call->class->perf_probe,
+ call);
+ return 0;
+#endif
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_reg);
+
+void trace_event_enable_cmd_record(bool enable)
+{
+ struct ftrace_event_call *call;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+ continue;
+
+ if (enable) {
+ tracing_start_cmdline_record();
+ call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ } else {
+ tracing_stop_cmdline_record();
+ call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ }
+ }
+ mutex_unlock(&event_mutex);
+}
+
static int ftrace_event_enable_disable(struct ftrace_event_call *call,
int enable)
{
@@ -124,23 +197,29 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
switch (enable) {
case 0:
- if (call->enabled) {
- call->enabled = 0;
- tracing_stop_cmdline_record();
- call->unregfunc(call);
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
+ call->flags &= ~TRACE_EVENT_FL_ENABLED;
+ if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+ tracing_stop_cmdline_record();
+ call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ }
+ call->class->reg(call, TRACE_REG_UNREGISTER);
}
break;
case 1:
- if (!call->enabled) {
- tracing_start_cmdline_record();
- ret = call->regfunc(call);
+ if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
+ if (trace_flags & TRACE_ITER_RECORD_CMD) {
+ tracing_start_cmdline_record();
+ call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ }
+ ret = call->class->reg(call, TRACE_REG_REGISTER);
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
"%s\n", call->name);
break;
}
- call->enabled = 1;
+ call->flags |= TRACE_EVENT_FL_ENABLED;
}
break;
}
@@ -171,15 +250,15 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class || !call->class->reg)
continue;
if (match &&
strcmp(match, call->name) != 0 &&
- strcmp(match, call->system) != 0)
+ strcmp(match, call->class->system) != 0)
continue;
- if (sub && strcmp(sub, call->system) != 0)
+ if (sub && strcmp(sub, call->class->system) != 0)
continue;
if (event && strcmp(event, call->name) != 0)
@@ -297,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
- if (call->regfunc)
+ if (call->class && call->class->reg)
return call;
}
@@ -328,7 +407,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
(*pos)++;
list_for_each_entry_continue(call, &ftrace_events, list) {
- if (call->enabled)
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
return call;
}
@@ -355,8 +434,8 @@ static int t_show(struct seq_file *m, void *v)
{
struct ftrace_event_call *call = v;
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- seq_printf(m, "%s:", call->system);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ seq_printf(m, "%s:", call->class->system);
seq_printf(m, "%s\n", call->name);
return 0;
@@ -387,7 +466,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
struct ftrace_event_call *call = filp->private_data;
char *buf;
- if (call->enabled)
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
buf = "1\n";
else
buf = "0\n";
@@ -450,10 +529,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class || !call->class->reg)
continue;
- if (system && strcmp(call->system, system) != 0)
+ if (system && strcmp(call->class->system, system) != 0)
continue;
/*
@@ -461,7 +540,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!call->enabled);
+ set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -519,31 +598,11 @@ out:
return ret;
}
-static ssize_t
-event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
- loff_t *ppos)
+static void print_event_fields(struct trace_seq *s, struct list_head *head)
{
- struct ftrace_event_call *call = filp->private_data;
struct ftrace_event_field *field;
- struct trace_seq *s;
- int common_field_count = 5;
- char *buf;
- int r = 0;
-
- if (*ppos)
- return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- trace_seq_init(s);
-
- trace_seq_printf(s, "name: %s\n", call->name);
- trace_seq_printf(s, "ID: %d\n", call->id);
- trace_seq_printf(s, "format:\n");
-
- list_for_each_entry_reverse(field, &call->fields, link) {
+ list_for_each_entry_reverse(field, head, link) {
/*
* Smartly shows the array type(except dynamic array).
* Normal:
@@ -557,29 +616,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
array_descriptor = NULL;
if (!array_descriptor) {
- r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
+ trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
"\tsize:%u;\tsigned:%d;\n",
field->type, field->name, field->offset,
field->size, !!field->is_signed);
} else {
- r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
+ trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
"\tsize:%u;\tsigned:%d;\n",
(int)(array_descriptor - field->type),
field->type, field->name,
array_descriptor, field->offset,
field->size, !!field->is_signed);
}
+ }
+}
- if (--common_field_count == 0)
- r = trace_seq_printf(s, "\n");
+static ssize_t
+event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_call *call = filp->private_data;
+ struct list_head *head;
+ struct trace_seq *s;
+ char *buf;
+ int r;
- if (!r)
- break;
- }
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
- if (r)
- r = trace_seq_printf(s, "\nprint fmt: %s\n",
- call->print_fmt);
+ trace_seq_init(s);
+
+ trace_seq_printf(s, "name: %s\n", call->name);
+ trace_seq_printf(s, "ID: %d\n", call->event.type);
+ trace_seq_printf(s, "format:\n");
+
+ /* print common fields */
+ print_event_fields(s, &ftrace_common_fields);
+
+ trace_seq_putc(s, '\n');
+
+ /* print event specific fields */
+ head = trace_get_fields(call);
+ print_event_fields(s, head);
+
+ r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt);
if (!r) {
/*
@@ -613,7 +697,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return -ENOMEM;
trace_seq_init(s);
- trace_seq_printf(s, "%d\n", call->id);
+ trace_seq_printf(s, "%d\n", call->event.type);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
@@ -919,14 +1003,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
const struct file_operations *filter,
const struct file_operations *format)
{
+ struct list_head *head;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- d_events = event_subsystem_dir(call->system, d_events);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ d_events = event_subsystem_dir(call->class->system, d_events);
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
@@ -935,26 +1020,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return -1;
}
- if (call->regfunc)
+ if (call->class->reg)
trace_create_file("enable", 0644, call->dir, call,
enable);
- if (call->id && call->perf_event_enable)
+#ifdef CONFIG_PERF_EVENTS
+ if (call->event.type && call->class->reg)
trace_create_file("id", 0444, call->dir, call,
id);
+#endif
- if (call->define_fields) {
- ret = trace_define_common_fields(call);
- if (!ret)
- ret = call->define_fields(call);
+ /*
+ * Other events may have the same class. Only update
+ * the fields if they are not already defined.
+ */
+ head = trace_get_fields(call);
+ if (list_empty(head)) {
+ ret = call->class->define_fields(call);
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
return ret;
}
- trace_create_file("filter", 0644, call->dir, call,
- filter);
}
+ trace_create_file("filter", 0644, call->dir, call,
+ filter);
trace_create_file("format", 0444, call->dir, call,
format);
@@ -962,20 +1052,26 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return 0;
}
-static int __trace_add_event_call(struct ftrace_event_call *call)
+static int
+__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
+ const struct file_operations *id,
+ const struct file_operations *enable,
+ const struct file_operations *filter,
+ const struct file_operations *format)
{
struct dentry *d_events;
int ret;
+ /* The linker may leave blanks */
if (!call->name)
return -EINVAL;
- if (call->raw_init) {
- ret = call->raw_init(call);
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
- pr_warning("Could not initialize trace "
- "events/%s\n", call->name);
+ pr_warning("Could not initialize trace events/%s\n",
+ call->name);
return ret;
}
}
@@ -984,11 +1080,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
if (!d_events)
return -ENOENT;
- ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
- &ftrace_enable_fops, &ftrace_event_filter_fops,
- &ftrace_event_format_fops);
+ ret = event_create_dir(call, d_events, id, enable, filter, format);
if (!ret)
list_add(&call->list, &ftrace_events);
+ call->mod = mod;
return ret;
}
@@ -998,7 +1093,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
{
int ret;
mutex_lock(&event_mutex);
- ret = __trace_add_event_call(call);
+ ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
+ &ftrace_enable_fops,
+ &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
mutex_unlock(&event_mutex);
return ret;
}
@@ -1035,13 +1133,13 @@ static void remove_subsystem_dir(const char *name)
static void __trace_remove_event_call(struct ftrace_event_call *call)
{
ftrace_event_enable_disable(call, 0);
- if (call->event)
- __unregister_ftrace_event(call->event);
+ if (call->event.funcs)
+ __unregister_ftrace_event(&call->event);
debugfs_remove_recursive(call->dir);
list_del(&call->list);
trace_destroy_fields(call);
destroy_preds(call);
- remove_subsystem_dir(call->system);
+ remove_subsystem_dir(call->class->system);
}
/* Remove an event_call */
@@ -1115,8 +1213,6 @@ static void trace_module_add_events(struct module *mod)
{
struct ftrace_module_file_ops *file_ops = NULL;
struct ftrace_event_call *call, *start, *end;
- struct dentry *d_events;
- int ret;
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
@@ -1124,38 +1220,14 @@ static void trace_module_add_events(struct module *mod)
if (start == end)
return;
- d_events = event_trace_events_dir();
- if (!d_events)
+ file_ops = trace_create_file_ops(mod);
+ if (!file_ops)
return;
for_each_event(call, start, end) {
- /* The linker may leave blanks */
- if (!call->name)
- continue;
- if (call->raw_init) {
- ret = call->raw_init(call);
- if (ret < 0) {
- if (ret != -ENOSYS)
- pr_warning("Could not initialize trace "
- "point events/%s\n", call->name);
- continue;
- }
- }
- /*
- * This module has events, create file ops for this module
- * if not already done.
- */
- if (!file_ops) {
- file_ops = trace_create_file_ops(mod);
- if (!file_ops)
- return;
- }
- call->mod = mod;
- ret = event_create_dir(call, d_events,
+ __trace_add_event_call(call, mod,
&file_ops->id, &file_ops->enable,
&file_ops->filter, &file_ops->format);
- if (!ret)
- list_add(&call->list, &ftrace_events);
}
}
@@ -1282,25 +1354,14 @@ static __init int event_trace_init(void)
trace_create_file("enable", 0644, d_events,
NULL, &ftrace_system_enable_fops);
+ if (trace_define_common_fields())
+ pr_warning("tracing: Failed to allocate common fields");
+
for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
- /* The linker may leave blanks */
- if (!call->name)
- continue;
- if (call->raw_init) {
- ret = call->raw_init(call);
- if (ret < 0) {
- if (ret != -ENOSYS)
- pr_warning("Could not initialize trace "
- "point events/%s\n", call->name);
- continue;
- }
- }
- ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+ __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
&ftrace_enable_fops,
&ftrace_event_filter_fops,
&ftrace_event_format_fops);
- if (!ret)
- list_add(&call->list, &ftrace_events);
}
while (true) {
@@ -1388,8 +1449,8 @@ static __init void event_trace_self_tests(void)
list_for_each_entry(call, &ftrace_events, list) {
- /* Only test those that have a regfunc */
- if (!call->regfunc)
+ /* Only test those that have a probe */
+ if (!call->class || !call->class->probe)
continue;
/*
@@ -1399,8 +1460,8 @@ static __init void event_trace_self_tests(void)
* syscalls as we test.
*/
#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
- if (call->system &&
- strcmp(call->system, "syscalls") == 0)
+ if (call->class->system &&
+ strcmp(call->class->system, "syscalls") == 0)
continue;
#endif
@@ -1410,7 +1471,7 @@ static __init void event_trace_self_tests(void)
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (call->enabled) {
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
pr_warning("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
@@ -1487,12 +1548,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
struct ftrace_entry *entry;
unsigned long flags;
long disabled;
- int resched;
int cpu;
int pc;
pc = preempt_count();
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
@@ -1514,7 +1574,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 58092d844a1f..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,11 +497,11 @@ void print_subsystem_event_filter(struct event_subsystem *system,
}
static struct ftrace_event_field *
-find_event_field(struct ftrace_event_call *call, char *name)
+__find_event_field(struct list_head *head, char *name)
{
struct ftrace_event_field *field;
- list_for_each_entry(field, &call->fields, link) {
+ list_for_each_entry(field, head, link) {
if (!strcmp(field->name, name))
return field;
}
@@ -509,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
return NULL;
}
+static struct ftrace_event_field *
+find_event_field(struct ftrace_event_call *call, char *name)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ field = __find_event_field(&ftrace_common_fields, name);
+ if (field)
+ return field;
+
+ head = trace_get_fields(call);
+ return __find_event_field(head, name);
+}
+
static void filter_free_pred(struct filter_pred *pred)
{
if (!pred)
@@ -545,7 +559,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
struct event_filter *filter = call->filter;
int i;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
filter->n_preds = 0;
for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +586,7 @@ void destroy_preds(struct ftrace_event_call *call)
{
__free_preds(call->filter);
call->filter = NULL;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
static struct event_filter *__alloc_preds(void)
@@ -611,7 +625,7 @@ static int init_preds(struct ftrace_event_call *call)
if (call->filter)
return 0;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
call->filter = __alloc_preds();
if (IS_ERR(call->filter))
return PTR_ERR(call->filter);
@@ -625,10 +639,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
int err;
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->define_fields)
- continue;
-
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
err = init_preds(call);
@@ -644,10 +655,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
struct ftrace_event_call *call;
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->define_fields)
- continue;
-
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
filter_disable_preds(call);
@@ -1249,10 +1257,7 @@ static int replace_system_preds(struct event_subsystem *system,
list_for_each_entry(call, &ftrace_events, list) {
struct event_filter *filter = call->filter;
- if (!call->define_fields)
- continue;
-
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
/* try to see if the filter can be applied */
@@ -1266,7 +1271,7 @@ static int replace_system_preds(struct event_subsystem *system,
if (err)
filter_disable_preds(call);
else {
- call->filter_active = 1;
+ call->flags |= TRACE_EVENT_FL_FILTERED;
replace_filter_string(filter, filter_string);
}
fail = false;
@@ -1315,7 +1320,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
if (err)
append_filter_err(ps, call->filter);
else
- call->filter_active = 1;
+ call->flags |= TRACE_EVENT_FL_FILTERED;
out:
filter_opstack_clear(ps);
postfix_clear(ps);
@@ -1393,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (call->id == event_id)
+ if (call->event.type == event_id)
break;
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#include "trace_entries.h"
-static int ftrace_raw_init_event(struct ftrace_event_call *call)
-{
- INIT_LIST_HEAD(&call->fields);
- return 0;
-}
-
#undef __entry
#define __entry REC
@@ -153,17 +147,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
#define F_printk(fmt, args...) #fmt ", " __stringify(args)
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
+ \
+struct ftrace_event_class event_class_ftrace_##call = { \
+ .system = __stringify(TRACE_SYSTEM), \
+ .define_fields = ftrace_define_fields_##call, \
+ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
+}; \
\
struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
- .id = type, \
- .system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_raw_init_event, \
+ .event.type = etype, \
+ .class = &event_class_ftrace_##call, \
.print_fmt = print, \
- .define_fields = ftrace_define_fields_##call, \
}; \
#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
- int cpu, resched;
+ int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
pc = preempt_count();
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
local_save_flags(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
trace_function(tr, ip, parent_ip, flags, pc);
atomic_dec(&data->disabled);
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index dd11c830eb84..6bff23625781 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
/* Print nsecs (we don't want to exceed 7 numbers) */
if (len < 7) {
- snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
+ snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
+ nsecs_rem);
ret = trace_seq_printf(s, ".%s", nsecs_str);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -1025,7 +1026,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (!event)
return TRACE_TYPE_UNHANDLED;
- ret = event->trace(iter, sym_flags);
+ ret = event->funcs->trace(iter, sym_flags, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -1112,7 +1113,8 @@ print_graph_function(struct trace_iterator *iter)
}
static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags)
+print_graph_function_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return print_graph_function(iter);
}
@@ -1225,14 +1227,18 @@ void graph_trace_close(struct trace_iterator *iter)
}
}
+static struct trace_event_functions graph_functions = {
+ .trace = print_graph_function_event,
+};
+
static struct trace_event graph_trace_entry_event = {
.type = TRACE_GRAPH_ENT,
- .trace = print_graph_function_event,
+ .funcs = &graph_functions,
};
static struct trace_event graph_trace_ret_event = {
.type = TRACE_GRAPH_RET,
- .trace = print_graph_function_event,
+ .funcs = &graph_functions
};
static struct tracer graph_trace __read_mostly = {
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..73a6b0601f2e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
+ .use_max_tr = 1,
};
# define register_irqsoff(trace) register_tracer(&trace)
#else
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
+ .use_max_tr = 1,
};
# define register_preemptoff(trace) register_tracer(&trace)
#else
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
+ .use_max_tr = 1,
};
# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..7b8ecd751d93
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,136 @@
+/*
+ * kdb helper for dumping the ftrace buffer
+ *
+ * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
+ *
+ * ftrace_dump_buf based on ftrace_dump:
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ */
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/ftrace.h>
+
+#include "../debug/kdb/kdb_private.h"
+#include "trace.h"
+#include "trace_output.h"
+
+static void ftrace_dump_buf(int skip_lines, long cpu_file)
+{
+ /* use static because iter can be a bit big for the stack */
+ static struct trace_iterator iter;
+ unsigned int old_userobj;
+ int cnt = 0, cpu;
+
+ trace_init_global_iter(&iter);
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&iter.tr->data[cpu]->disabled);
+ }
+
+ old_userobj = trace_flags;
+
+ /* don't look at user memory in panic mode */
+ trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
+ kdb_printf("Dumping ftrace buffer:\n");
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter.iter_flags |= TRACE_FILE_LAT_FMT;
+ iter.pos = -1;
+
+ if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ for_each_tracing_cpu(cpu) {
+ iter.buffer_iter[cpu] =
+ ring_buffer_read_prepare(iter.tr->buffer, cpu);
+ ring_buffer_read_start(iter.buffer_iter[cpu]);
+ tracing_iter_reset(&iter, cpu);
+ }
+ } else {
+ iter.cpu_file = cpu_file;
+ iter.buffer_iter[cpu_file] =
+ ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
+ ring_buffer_read_start(iter.buffer_iter[cpu_file]);
+ tracing_iter_reset(&iter, cpu_file);
+ }
+ if (!trace_empty(&iter))
+ trace_find_next_entry_inc(&iter);
+ while (!trace_empty(&iter)) {
+ if (!cnt)
+ kdb_printf("---------------------------------\n");
+ cnt++;
+
+ if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+ print_trace_line(&iter);
+ if (!skip_lines)
+ trace_printk_seq(&iter.seq);
+ else
+ skip_lines--;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ goto out;
+ }
+
+ if (!cnt)
+ kdb_printf(" (ftrace buffer empty)\n");
+ else
+ kdb_printf("---------------------------------\n");
+
+out:
+ trace_flags = old_userobj;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
+ }
+
+ for_each_tracing_cpu(cpu)
+ if (iter.buffer_iter[cpu])
+ ring_buffer_read_finish(iter.buffer_iter[cpu]);
+}
+
+/*
+ * kdb_ftdump - Dump the ftrace log buffer
+ */
+static int kdb_ftdump(int argc, const char **argv)
+{
+ int skip_lines = 0;
+ long cpu_file;
+ char *cp;
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ skip_lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ skip_lines = 0;
+ }
+
+ if (argc == 2) {
+ cpu_file = simple_strtol(argv[2], &cp, 0);
+ if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+ !cpu_online(cpu_file))
+ return KDB_BADINT;
+ } else {
+ cpu_file = TRACE_PIPE_ALL_CPU;
+ }
+
+ kdb_trap_printk++;
+ ftrace_dump_buf(skip_lines, cpu_file);
+ kdb_trap_printk--;
+
+ return 0;
+}
+
+static __init int kdb_ftrace_register(void)
+{
+ kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+ "Dump ftrace log", 0, KDB_REPEAT_NONE);
+ return 0;
+}
+
+late_initcall(kdb_ftrace_register);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a7514326052b..8b27c9849b42 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,8 @@
#include <linux/ptrace.h>
#include <linux/perf_event.h>
#include <linux/stringify.h>
+#include <linux/limits.h>
+#include <linux/uaccess.h>
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -38,6 +40,7 @@
#define MAX_TRACE_ARGS 128
#define MAX_ARGSTR_LEN 63
#define MAX_EVENT_NAME_LEN 64
+#define MAX_STRING_SIZE PATH_MAX
#define KPROBE_EVENT_SYSTEM "kprobes"
/* Reserved field names */
@@ -58,14 +61,16 @@ const char *reserved_field_names[] = {
};
/* Printing function type */
-typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
+ void *);
#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
/* Printing in basic type function template */
#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
- const char *name, void *data)\
+ const char *name, \
+ void *data, void *ent)\
{ \
return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
} \
@@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
+/* data_rloc: data relative location, compatible with u32 */
+#define make_data_rloc(len, roffs) \
+ (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
+#define get_rloc_len(dl) ((u32)(dl) >> 16)
+#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
+
+static inline void *get_rloc_data(u32 *dl)
+{
+ return (u8 *)dl + get_rloc_offs(*dl);
+}
+
+/* For data_loc conversion */
+static inline void *get_loc_data(u32 *dl, void *ent)
+{
+ return (u8 *)ent + get_rloc_offs(*dl);
+}
+
+/*
+ * Convert data_rloc to data_loc:
+ * data_rloc stores the offset from data_rloc itself, but data_loc
+ * stores the offset from event entry.
+ */
+#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
+
+/* For defining macros, define string/string_size types */
+typedef u32 string;
+typedef u32 string_size;
+
+/* Print type function for string type */
+static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
+ const char *name,
+ void *data, void *ent)
+{
+ int len = *(u32 *)data >> 16;
+
+ if (!len)
+ return trace_seq_printf(s, " %s=(fault)", name);
+ else
+ return trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+}
+static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
+
/* Data fetch function type */
typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
@@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
return fprm->fn(regs, fprm->data, dest);
}
-#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
+#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
/*
* Define macro for basic types - we don't need to define s* types, because
* we have to care only about bitwidth at recording time.
*/
-#define DEFINE_BASIC_FETCH_FUNCS(kind) \
-DEFINE_FETCH_##kind(u8) \
-DEFINE_FETCH_##kind(u16) \
-DEFINE_FETCH_##kind(u32) \
-DEFINE_FETCH_##kind(u64)
-
-#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
- ((FETCH_FUNC_NAME(kind, u8) == fn) || \
- (FETCH_FUNC_NAME(kind, u16) == fn) || \
- (FETCH_FUNC_NAME(kind, u32) == fn) || \
- (FETCH_FUNC_NAME(kind, u64) == fn))
+#define DEFINE_BASIC_FETCH_FUNCS(method) \
+DEFINE_FETCH_##method(u8) \
+DEFINE_FETCH_##method(u16) \
+DEFINE_FETCH_##method(u32) \
+DEFINE_FETCH_##method(u64)
+
+#define CHECK_FETCH_FUNCS(method, fn) \
+ (((FETCH_FUNC_NAME(method, u8) == fn) || \
+ (FETCH_FUNC_NAME(method, u16) == fn) || \
+ (FETCH_FUNC_NAME(method, u32) == fn) || \
+ (FETCH_FUNC_NAME(method, u64) == fn) || \
+ (FETCH_FUNC_NAME(method, string) == fn) || \
+ (FETCH_FUNC_NAME(method, string_size) == fn)) \
+ && (fn != NULL))
/* Data fetch function templates */
#define DEFINE_FETCH_reg(type) \
static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
- void *offset, void *dest) \
+ void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_register(regs, \
(unsigned int)((unsigned long)offset)); \
}
DEFINE_BASIC_FETCH_FUNCS(reg)
+/* No string on the register */
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
#define DEFINE_FETCH_stack(type) \
static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
(unsigned int)((unsigned long)offset)); \
}
DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
#define DEFINE_FETCH_retval(type) \
static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
*(type *)dest = (type)regs_return_value(regs); \
}
DEFINE_BASIC_FETCH_FUNCS(retval)
+/* No string on the retval */
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
#define DEFINE_FETCH_memory(type) \
static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
*(type *)dest = retval; \
}
DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
+ * length and relative data location.
+ */
+static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ long ret;
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ u8 *src = addr;
+ mm_segment_t old_fs = get_fs();
+ if (!maxlen)
+ return;
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ do
+ ret = __copy_from_user_inatomic(dst++, src++, 1);
+ while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
+ dst[-1] = '\0';
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) { /* Failed to fetch string */
+ ((u8 *)get_rloc_data(dest))[0] = '\0';
+ *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
+ } else
+ *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
+ get_rloc_offs(*(u32 *)dest));
+}
+/* Return the length of string -- including null terminal byte */
+static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ int ret, len = 0;
+ u8 c;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ do {
+ ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) /* Failed to check the length */
+ *(u32 *)dest = 0;
+ else
+ *(u32 *)dest = len;
+}
/* Memory fetching by symbol */
struct symbol_cache {
@@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
*(type *)dest = 0; \
}
DEFINE_BASIC_FETCH_FUNCS(symbol)
+DEFINE_FETCH_symbol(string)
+DEFINE_FETCH_symbol(string_size)
/* Dereference memory access function */
struct deref_fetch_param {
@@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
*(type *)dest = 0; \
}
DEFINE_BASIC_FETCH_FUNCS(deref)
+DEFINE_FETCH_deref(string)
+DEFINE_FETCH_deref(string_size)
static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
{
- if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
free_deref_fetch_param(data->orig.data);
- else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
free_symbol_cache(data->orig.data);
kfree(data);
}
@@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
-#define ASSIGN_FETCH_FUNC(kind, type) \
- .kind = FETCH_FUNC_NAME(kind, type)
-
-#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
- {.name = #ptype, \
- .size = sizeof(ftype), \
- .is_signed = sign, \
- .print = PRINT_TYPE_FUNC_NAME(ptype), \
- .fmt = PRINT_TYPE_FMT_NAME(ptype), \
-ASSIGN_FETCH_FUNC(reg, ftype), \
-ASSIGN_FETCH_FUNC(stack, ftype), \
-ASSIGN_FETCH_FUNC(retval, ftype), \
-ASSIGN_FETCH_FUNC(memory, ftype), \
-ASSIGN_FETCH_FUNC(symbol, ftype), \
-ASSIGN_FETCH_FUNC(deref, ftype), \
+/* Fetch types */
+enum {
+ FETCH_MTD_reg = 0,
+ FETCH_MTD_stack,
+ FETCH_MTD_retval,
+ FETCH_MTD_memory,
+ FETCH_MTD_symbol,
+ FETCH_MTD_deref,
+ FETCH_MTD_END,
+};
+
+#define ASSIGN_FETCH_FUNC(method, type) \
+ [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
+ {.name = _name, \
+ .size = _size, \
+ .is_signed = sign, \
+ .print = PRINT_TYPE_FUNC_NAME(ptype), \
+ .fmt = PRINT_TYPE_FMT_NAME(ptype), \
+ .fmttype = _fmttype, \
+ .fetch = { \
+ASSIGN_FETCH_FUNC(reg, ftype), \
+ASSIGN_FETCH_FUNC(stack, ftype), \
+ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(memory, ftype), \
+ASSIGN_FETCH_FUNC(symbol, ftype), \
+ASSIGN_FETCH_FUNC(deref, ftype), \
+ } \
}
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
+ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+
+#define FETCH_TYPE_STRING 0
+#define FETCH_TYPE_STRSIZE 1
+
/* Fetch type information table */
static const struct fetch_type {
const char *name; /* Name of type */
@@ -264,14 +404,16 @@ static const struct fetch_type {
int is_signed; /* Signed flag */
print_type_func_t print; /* Print functions */
const char *fmt; /* Fromat string */
+ const char *fmttype; /* Name in format file */
/* Fetch functions */
- fetch_func_t reg;
- fetch_func_t stack;
- fetch_func_t retval;
- fetch_func_t memory;
- fetch_func_t symbol;
- fetch_func_t deref;
+ fetch_func_t fetch[FETCH_MTD_END];
} fetch_type_table[] = {
+ /* Special types */
+ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+ sizeof(u32), 1, "__data_loc char[]"),
+ [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+ string_size, sizeof(u32), 0, "u32"),
+ /* Basic types */
ASSIGN_FETCH_TYPE(u8, u8, 0),
ASSIGN_FETCH_TYPE(u16, u16, 0),
ASSIGN_FETCH_TYPE(u32, u32, 0),
@@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
*(unsigned long *)dest = kernel_stack_pointer(regs);
}
+static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
+ fetch_func_t orig_fn)
+{
+ int i;
+
+ if (type != &fetch_type_table[FETCH_TYPE_STRING])
+ return NULL; /* Only string type needs size function */
+ for (i = 0; i < FETCH_MTD_END; i++)
+ if (type->fetch[i] == orig_fn)
+ return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
+
+ WARN_ON(1); /* This should not happen */
+ return NULL;
+}
+
/**
* Kprobe event core functions
*/
struct probe_arg {
struct fetch_param fetch;
+ struct fetch_param fetch_size;
unsigned int offset; /* Offset from argument entry */
const char *name; /* Name of this argument */
const char *comm; /* Command of this argument */
@@ -324,8 +482,8 @@ struct trace_probe {
unsigned long nhit;
unsigned int flags; /* For TP_FLAG_* */
const char *symbol; /* symbol name */
+ struct ftrace_event_class class;
struct ftrace_event_call call;
- struct trace_event event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
@@ -404,6 +562,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
goto error;
}
+ tp->call.class = &tp->class;
tp->call.name = kstrdup(event, GFP_KERNEL);
if (!tp->call.name)
goto error;
@@ -413,8 +572,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
goto error;
}
- tp->call.system = kstrdup(group, GFP_KERNEL);
- if (!tp->call.system)
+ tp->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->class.system)
goto error;
INIT_LIST_HEAD(&tp->list);
@@ -428,9 +587,9 @@ error:
static void free_probe_arg(struct probe_arg *arg)
{
- if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
+ if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
free_deref_fetch_param(arg->fetch.data);
- else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
free_symbol_cache(arg->fetch.data);
kfree(arg->name);
kfree(arg->comm);
@@ -443,7 +602,7 @@ static void free_trace_probe(struct trace_probe *tp)
for (i = 0; i < tp->nr_args; i++)
free_probe_arg(&tp->args[i]);
- kfree(tp->call.system);
+ kfree(tp->call.class->system);
kfree(tp->call.name);
kfree(tp->symbol);
kfree(tp);
@@ -456,7 +615,7 @@ static struct trace_probe *find_probe_event(const char *event,
list_for_each_entry(tp, &probe_list, list)
if (strcmp(tp->call.name, event) == 0 &&
- strcmp(tp->call.system, group) == 0)
+ strcmp(tp->call.class->system, group) == 0)
return tp;
return NULL;
}
@@ -481,7 +640,7 @@ static int register_trace_probe(struct trace_probe *tp)
mutex_lock(&probe_lock);
/* register as an event */
- old_tp = find_probe_event(tp->call.name, tp->call.system);
+ old_tp = find_probe_event(tp->call.name, tp->call.class->system);
if (old_tp) {
/* delete old event */
unregister_trace_probe(old_tp);
@@ -547,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
if (strcmp(arg, "retval") == 0) {
if (is_return)
- f->fn = t->retval;
+ f->fn = t->fetch[FETCH_MTD_retval];
else
ret = -EINVAL;
} else if (strncmp(arg, "stack", 5) == 0) {
@@ -561,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
if (ret || param > PARAM_MAX_STACK)
ret = -EINVAL;
else {
- f->fn = t->stack;
+ f->fn = t->fetch[FETCH_MTD_stack];
f->data = (void *)param;
}
} else
@@ -587,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
case '%': /* named register */
ret = regs_query_register_offset(arg + 1);
if (ret >= 0) {
- f->fn = t->reg;
+ f->fn = t->fetch[FETCH_MTD_reg];
f->data = (void *)(unsigned long)ret;
ret = 0;
}
@@ -597,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
ret = strict_strtoul(arg + 1, 0, &param);
if (ret)
break;
- f->fn = t->memory;
+ f->fn = t->fetch[FETCH_MTD_memory];
f->data = (void *)param;
} else {
ret = split_symbol_offset(arg + 1, &offset);
@@ -605,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
break;
f->data = alloc_symbol_cache(arg + 1, offset);
if (f->data)
- f->fn = t->symbol;
+ f->fn = t->fetch[FETCH_MTD_symbol];
}
break;
case '+': /* deref memory */
@@ -635,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
if (ret)
kfree(dprm);
else {
- f->fn = t->deref;
+ f->fn = t->fetch[FETCH_MTD_deref];
f->data = (void *)dprm;
}
}
break;
}
- if (!ret && !f->fn)
+ if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
+ pr_info("%s type has no corresponding fetch method.\n",
+ t->name);
ret = -EINVAL;
+ }
return ret;
}
@@ -651,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
struct probe_arg *parg, int is_return)
{
const char *t;
+ int ret;
if (strlen(arg) > MAX_ARGSTR_LEN) {
pr_info("Argument is too long.: %s\n", arg);
@@ -673,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
}
parg->offset = tp->size;
tp->size += parg->type->size;
- return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+ ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+ if (ret >= 0) {
+ parg->fetch_size.fn = get_fetch_size_function(parg->type,
+ parg->fetch.fn);
+ parg->fetch_size.data = parg->fetch.data;
+ }
+ return ret;
}
/* Return 1 if name is reserved or already used by another argument */
@@ -756,14 +925,17 @@ static int create_trace_probe(int argc, char **argv)
pr_info("Delete command needs an event name.\n");
return -EINVAL;
}
+ mutex_lock(&probe_lock);
tp = find_probe_event(event, group);
if (!tp) {
+ mutex_unlock(&probe_lock);
pr_info("Event %s/%s doesn't exist.\n", group, event);
return -ENOENT;
}
/* delete an event */
unregister_trace_probe(tp);
free_trace_probe(tp);
+ mutex_unlock(&probe_lock);
return 0;
}
@@ -904,7 +1076,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
int i;
seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
- seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
+ seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
if (!tp->symbol)
seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -1042,6 +1214,54 @@ static const struct file_operations kprobe_profile_ops = {
.release = seq_release,
};
+/* Sum up total data length for dynamic arraies (strings) */
+static __kprobes int __get_data_size(struct trace_probe *tp,
+ struct pt_regs *regs)
+{
+ int i, ret = 0;
+ u32 len;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ call_fetch(&tp->args[i].fetch_size, regs, &len);
+ ret += len;
+ }
+
+ return ret;
+}
+
+/* Store the value of each argument */
+static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
+ struct pt_regs *regs,
+ u8 *data, int maxlen)
+{
+ int i;
+ u32 end = tp->size;
+ u32 *dl; /* Data (relative) location */
+
+ for (i = 0; i < tp->nr_args; i++) {
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ /*
+ * First, we set the relative location and
+ * maximum data length to *dl
+ */
+ dl = (u32 *)(data + tp->args[i].offset);
+ *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
+ /* Then try to fetch string or dynamic array data */
+ call_fetch(&tp->args[i].fetch, regs, dl);
+ /* Reduce maximum length */
+ end += get_rloc_len(*dl);
+ maxlen -= get_rloc_len(*dl);
+ /* Trick here, convert data_rloc to data_loc */
+ *dl = convert_rloc_to_loc(*dl,
+ ent_size + tp->args[i].offset);
+ } else
+ /* Just fetching data normally */
+ call_fetch(&tp->args[i].fetch, regs,
+ data + tp->args[i].offset);
+ }
+}
+
/* Kprobe handler */
static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
{
@@ -1049,8 +1269,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
struct kprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
- u8 *data;
- int size, i, pc;
+ int size, dsize, pc;
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;
@@ -1059,18 +1278,17 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
local_save_flags(irq_flags);
pc = preempt_count();
- size = sizeof(*entry) + tp->size;
+ dsize = __get_data_size(tp, regs);
+ size = sizeof(*entry) + tp->size + dsize;
- event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
- irq_flags, pc);
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->ip = (unsigned long)kp->addr;
- data = (u8 *)&entry[1];
- for (i = 0; i < tp->nr_args; i++)
- call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1084,27 +1302,25 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
struct kretprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
- u8 *data;
- int size, i, pc;
+ int size, pc, dsize;
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;
local_save_flags(irq_flags);
pc = preempt_count();
- size = sizeof(*entry) + tp->size;
+ dsize = __get_data_size(tp, regs);
+ size = sizeof(*entry) + tp->size + dsize;
- event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
- irq_flags, pc);
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
- data = (u8 *)&entry[1];
- for (i = 0; i < tp->nr_args; i++)
- call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1112,18 +1328,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
/* Event entry printers */
enum print_line_t
-print_kprobe_event(struct trace_iterator *iter, int flags)
+print_kprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct kprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
- struct trace_event *event;
struct trace_probe *tp;
u8 *data;
int i;
field = (struct kprobe_trace_entry_head *)iter->ent;
- event = ftrace_find_event(field->ent.type);
- tp = container_of(event, struct trace_probe, event);
+ tp = container_of(event, struct trace_probe, call.event);
if (!trace_seq_printf(s, "%s: (", tp->call.name))
goto partial;
@@ -1137,7 +1352,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
- data + tp->args[i].offset))
+ data + tp->args[i].offset, field))
goto partial;
if (!trace_seq_puts(s, "\n"))
@@ -1149,18 +1364,17 @@ partial:
}
enum print_line_t
-print_kretprobe_event(struct trace_iterator *iter, int flags)
+print_kretprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct kretprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
- struct trace_event *event;
struct trace_probe *tp;
u8 *data;
int i;
field = (struct kretprobe_trace_entry_head *)iter->ent;
- event = ftrace_find_event(field->ent.type);
- tp = container_of(event, struct trace_probe, event);
+ tp = container_of(event, struct trace_probe, call.event);
if (!trace_seq_printf(s, "%s: (", tp->call.name))
goto partial;
@@ -1180,7 +1394,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
- data + tp->args[i].offset))
+ data + tp->args[i].offset, field))
goto partial;
if (!trace_seq_puts(s, "\n"))
@@ -1215,13 +1429,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
}
}
-static int probe_event_raw_init(struct ftrace_event_call *event_call)
-{
- INIT_LIST_HEAD(&event_call->fields);
-
- return 0;
-}
-
#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed) \
do { \
@@ -1242,7 +1449,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
/* Set argument names as fields */
for (i = 0; i < tp->nr_args; i++) {
- ret = trace_define_field(event_call, tp->args[i].type->name,
+ ret = trace_define_field(event_call, tp->args[i].type->fmttype,
tp->args[i].name,
sizeof(field) + tp->args[i].offset,
tp->args[i].type->size,
@@ -1264,7 +1471,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
/* Set argument names as fields */
for (i = 0; i < tp->nr_args; i++) {
- ret = trace_define_field(event_call, tp->args[i].type->name,
+ ret = trace_define_field(event_call, tp->args[i].type->fmttype,
tp->args[i].name,
sizeof(field) + tp->args[i].offset,
tp->args[i].type->size,
@@ -1304,8 +1511,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
for (i = 0; i < tp->nr_args; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
- tp->args[i].name);
+ if (strcmp(tp->args[i].type->name, "string") == 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_str(%s)",
+ tp->args[i].name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+ tp->args[i].name);
}
#undef LEN_OR_ZERO
@@ -1341,28 +1553,28 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry_head *entry;
- u8 *data;
- int size, __size, i;
- unsigned long irq_flags;
+ struct hlist_head *head;
+ int size, __size, dsize;
int rctx;
- __size = sizeof(*entry) + tp->size;
+ dsize = __get_data_size(tp, regs);
+ __size = sizeof(*entry) + tp->size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
"profile buffer not large enough"))
return;
- entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
return;
entry->ip = (unsigned long)kp->addr;
- data = (u8 *)&entry[1];
- for (i = 0; i < tp->nr_args; i++)
- call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+ memset(&entry[1], 0, dsize);
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
- perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
+ head = this_cpu_ptr(call->perf_events);
+ perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
}
/* Kretprobe profile handler */
@@ -1372,30 +1584,28 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry_head *entry;
- u8 *data;
- int size, __size, i;
- unsigned long irq_flags;
+ struct hlist_head *head;
+ int size, __size, dsize;
int rctx;
- __size = sizeof(*entry) + tp->size;
+ dsize = __get_data_size(tp, regs);
+ __size = sizeof(*entry) + tp->size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
"profile buffer not large enough"))
return;
- entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
return;
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
- data = (u8 *)&entry[1];
- for (i = 0; i < tp->nr_args; i++)
- call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
- perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
- irq_flags, regs);
+ head = this_cpu_ptr(call->perf_events);
+ perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
}
static int probe_perf_enable(struct ftrace_event_call *call)
@@ -1425,6 +1635,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
}
#endif /* CONFIG_PERF_EVENTS */
+static __kprobes
+int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return probe_event_enable(event);
+ case TRACE_REG_UNREGISTER:
+ probe_event_disable(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return probe_perf_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ probe_perf_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
static __kprobes
int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1454,43 +1684,43 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
return 0; /* We don't tweek kernel, so just return 0 */
}
+static struct trace_event_functions kretprobe_funcs = {
+ .trace = print_kretprobe_event
+};
+
+static struct trace_event_functions kprobe_funcs = {
+ .trace = print_kprobe_event
+};
+
static int register_probe_event(struct trace_probe *tp)
{
struct ftrace_event_call *call = &tp->call;
int ret;
/* Initialize ftrace_event_call */
+ INIT_LIST_HEAD(&call->class->fields);
if (probe_is_return(tp)) {
- tp->event.trace = print_kretprobe_event;
- call->raw_init = probe_event_raw_init;
- call->define_fields = kretprobe_event_define_fields;
+ call->event.funcs = &kretprobe_funcs;
+ call->class->define_fields = kretprobe_event_define_fields;
} else {
- tp->event.trace = print_kprobe_event;
- call->raw_init = probe_event_raw_init;
- call->define_fields = kprobe_event_define_fields;
+ call->event.funcs = &kprobe_funcs;
+ call->class->define_fields = kprobe_event_define_fields;
}
if (set_print_fmt(tp) < 0)
return -ENOMEM;
- call->event = &tp->event;
- call->id = register_ftrace_event(&tp->event);
- if (!call->id) {
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
kfree(call->print_fmt);
return -ENODEV;
}
- call->enabled = 0;
- call->regfunc = probe_event_enable;
- call->unregfunc = probe_event_disable;
-
-#ifdef CONFIG_PERF_EVENTS
- call->perf_event_enable = probe_perf_enable;
- call->perf_event_disable = probe_perf_disable;
-#endif
+ call->flags = 0;
+ call->class->reg = kprobe_register;
call->data = tp;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n", call->name);
kfree(call->print_fmt);
- unregister_ftrace_event(&tp->event);
+ unregister_ftrace_event(&call->event);
}
return ret;
}
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
-/*
- * trace_ksym.c - Kernel Symbol Tracer
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2009
- */
-
-#include <linux/kallsyms.h>
-#include <linux/uaccess.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-#include <linux/hw_breakpoint.h>
-#include <asm/hw_breakpoint.h>
-
-#include <asm/atomic.h>
-
-#define KSYM_TRACER_OP_LEN 3 /* rw- */
-
-struct trace_ksym {
- struct perf_event **ksym_hbp;
- struct perf_event_attr attr;
-#ifdef CONFIG_PROFILE_KSYM_TRACER
- atomic64_t counter;
-#endif
- struct hlist_node ksym_hlist;
-};
-
-static struct trace_array *ksym_trace_array;
-
-static unsigned int ksym_tracing_enabled;
-
-static HLIST_HEAD(ksym_filter_head);
-
-static DEFINE_MUTEX(ksym_tracer_mutex);
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-
-#define MAX_UL_INT 0xffffffff
-
-void ksym_collect_stats(unsigned long hbp_hit_addr)
-{
- struct hlist_node *node;
- struct trace_ksym *entry;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
- if (entry->attr.bp_addr == hbp_hit_addr) {
- atomic64_inc(&entry->counter);
- break;
- }
- }
- rcu_read_unlock();
-}
-#endif /* CONFIG_PROFILE_KSYM_TRACER */
-
-void ksym_hbp_handler(struct perf_event *hbp, int nmi,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct ring_buffer_event *event;
- struct ksym_trace_entry *entry;
- struct ring_buffer *buffer;
- int pc;
-
- if (!ksym_tracing_enabled)
- return;
-
- buffer = ksym_trace_array->buffer;
-
- pc = preempt_count();
-
- event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
- sizeof(*entry), 0, pc);
- if (!event)
- return;
-
- entry = ring_buffer_event_data(event);
- entry->ip = instruction_pointer(regs);
- entry->type = hw_breakpoint_type(hbp);
- entry->addr = hw_breakpoint_addr(hbp);
- strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
- ksym_collect_stats(hw_breakpoint_addr(hbp));
-#endif /* CONFIG_PROFILE_KSYM_TRACER */
-
- trace_buffer_unlock_commit(buffer, event, 0, pc);
-}
-
-/* Valid access types are represented as
- *
- * rw- : Set Read/Write Access Breakpoint
- * -w- : Set Write Access Breakpoint
- * --- : Clear Breakpoints
- * --x : Set Execution Break points (Not available yet)
- *
- */
-static int ksym_trace_get_access_type(char *str)
-{
- int access = 0;
-
- if (str[0] == 'r')
- access |= HW_BREAKPOINT_R;
-
- if (str[1] == 'w')
- access |= HW_BREAKPOINT_W;
-
- if (str[2] == 'x')
- access |= HW_BREAKPOINT_X;
-
- switch (access) {
- case HW_BREAKPOINT_R:
- case HW_BREAKPOINT_W:
- case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
- return access;
- default:
- return -EINVAL;
- }
-}
-
-/*
- * There can be several possible malformed requests and we attempt to capture
- * all of them. We enumerate some of the rules
- * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
- * i.e. multiple ':' symbols disallowed. Possible uses are of the form
- * <module>:<ksym_name>:<op>.
- * 2. No delimiter symbol ':' in the input string
- * 3. Spurious operator symbols or symbols not in their respective positions
- * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
- * 5. Kernel symbol not a part of /proc/kallsyms
- * 6. Duplicate requests
- */
-static int parse_ksym_trace_str(char *input_string, char **ksymname,
- unsigned long *addr)
-{
- int ret;
-
- *ksymname = strsep(&input_string, ":");
- *addr = kallsyms_lookup_name(*ksymname);
-
- /* Check for malformed request: (2), (1) and (5) */
- if ((!input_string) ||
- (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
- (*addr == 0))
- return -EINVAL;;
-
- ret = ksym_trace_get_access_type(input_string);
-
- return ret;
-}
-
-int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
-{
- struct trace_ksym *entry;
- int ret = -ENOMEM;
-
- entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
- if (!entry)
- return -ENOMEM;
-
- hw_breakpoint_init(&entry->attr);
-
- entry->attr.bp_type = op;
- entry->attr.bp_addr = addr;
- entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
-
- entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
- ksym_hbp_handler);
-
- if (IS_ERR(entry->ksym_hbp)) {
- ret = PTR_ERR(entry->ksym_hbp);
- if (ret == -ENOSPC) {
- printk(KERN_ERR "ksym_tracer: Maximum limit reached."
- " No new requests for tracing can be accepted now.\n");
- } else {
- printk(KERN_INFO "ksym_tracer request failed. Try again"
- " later!!\n");
- }
- goto err;
- }
-
- hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
-
- return 0;
-
-err:
- kfree(entry);
-
- return ret;
-}
-
-static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
- size_t count, loff_t *ppos)
-{
- struct trace_ksym *entry;
- struct hlist_node *node;
- struct trace_seq *s;
- ssize_t cnt = 0;
- int ret;
-
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
- trace_seq_init(s);
-
- mutex_lock(&ksym_tracer_mutex);
-
- hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
- ret = trace_seq_printf(s, "%pS:",
- (void *)(unsigned long)entry->attr.bp_addr);
- if (entry->attr.bp_type == HW_BREAKPOINT_R)
- ret = trace_seq_puts(s, "r--\n");
- else if (entry->attr.bp_type == HW_BREAKPOINT_W)
- ret = trace_seq_puts(s, "-w-\n");
- else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
- ret = trace_seq_puts(s, "rw-\n");
- WARN_ON_ONCE(!ret);
- }
-
- cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
-
- mutex_unlock(&ksym_tracer_mutex);
-
- kfree(s);
-
- return cnt;
-}
-
-static void __ksym_trace_reset(void)
-{
- struct trace_ksym *entry;
- struct hlist_node *node, *node1;
-
- mutex_lock(&ksym_tracer_mutex);
- hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
- ksym_hlist) {
- unregister_wide_hw_breakpoint(entry->ksym_hbp);
- hlist_del_rcu(&(entry->ksym_hlist));
- synchronize_rcu();
- kfree(entry);
- }
- mutex_unlock(&ksym_tracer_mutex);
-}
-
-static ssize_t ksym_trace_filter_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct trace_ksym *entry;
- struct hlist_node *node;
- char *buf, *input_string, *ksymname = NULL;
- unsigned long ksym_addr = 0;
- int ret, op, changed = 0;
-
- buf = kzalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(buf, buffer, count))
- goto out;
-
- buf[count] = '\0';
- input_string = strstrip(buf);
-
- /*
- * Clear all breakpoints if:
- * 1: echo > ksym_trace_filter
- * 2: echo 0 > ksym_trace_filter
- * 3: echo "*:---" > ksym_trace_filter
- */
- if (!input_string[0] || !strcmp(input_string, "0") ||
- !strcmp(input_string, "*:---")) {
- __ksym_trace_reset();
- ret = 0;
- goto out;
- }
-
- ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
- if (ret < 0)
- goto out;
-
- mutex_lock(&ksym_tracer_mutex);
-
- ret = -EINVAL;
- hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
- if (entry->attr.bp_addr == ksym_addr) {
- /* Check for malformed request: (6) */
- if (entry->attr.bp_type != op)
- changed = 1;
- else
- goto out_unlock;
- break;
- }
- }
- if (changed) {
- unregister_wide_hw_breakpoint(entry->ksym_hbp);
- entry->attr.bp_type = op;
- ret = 0;
- if (op > 0) {
- entry->ksym_hbp =
- register_wide_hw_breakpoint(&entry->attr,
- ksym_hbp_handler);
- if (IS_ERR(entry->ksym_hbp))
- ret = PTR_ERR(entry->ksym_hbp);
- else
- goto out_unlock;
- }
- /* Error or "symbol:---" case: drop it */
- hlist_del_rcu(&(entry->ksym_hlist));
- synchronize_rcu();
- kfree(entry);
- goto out_unlock;
- } else {
- /* Check for malformed request: (4) */
- if (op)
- ret = process_new_ksym_entry(ksymname, op, ksym_addr);
- }
-out_unlock:
- mutex_unlock(&ksym_tracer_mutex);
-out:
- kfree(buf);
- return !ret ? count : ret;
-}
-
-static const struct file_operations ksym_tracing_fops = {
- .open = tracing_open_generic,
- .read = ksym_trace_filter_read,
- .write = ksym_trace_filter_write,
-};
-
-static void ksym_trace_reset(struct trace_array *tr)
-{
- ksym_tracing_enabled = 0;
- __ksym_trace_reset();
-}
-
-static int ksym_trace_init(struct trace_array *tr)
-{
- int cpu, ret = 0;
-
- for_each_online_cpu(cpu)
- tracing_reset(tr, cpu);
- ksym_tracing_enabled = 1;
- ksym_trace_array = tr;
-
- return ret;
-}
-
-static void ksym_trace_print_header(struct seq_file *m)
-{
- seq_puts(m,
- "# TASK-PID CPU# Symbol "
- "Type Function\n");
- seq_puts(m,
- "# | | | "
- " | |\n");
-}
-
-static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
- struct trace_seq *s = &iter->seq;
- struct ksym_trace_entry *field;
- char str[KSYM_SYMBOL_LEN];
- int ret;
-
- if (entry->type != TRACE_KSYM)
- return TRACE_TYPE_UNHANDLED;
-
- trace_assign_type(field, entry);
-
- ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
- entry->pid, iter->cpu, (char *)field->addr);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- switch (field->type) {
- case HW_BREAKPOINT_R:
- ret = trace_seq_printf(s, " R ");
- break;
- case HW_BREAKPOINT_W:
- ret = trace_seq_printf(s, " W ");
- break;
- case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
- ret = trace_seq_printf(s, " RW ");
- break;
- default:
- return TRACE_TYPE_PARTIAL_LINE;
- }
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- sprint_symbol(str, field->ip);
- ret = trace_seq_printf(s, "%s\n", str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-struct tracer ksym_tracer __read_mostly =
-{
- .name = "ksym_tracer",
- .init = ksym_trace_init,
- .reset = ksym_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_ksym,
-#endif
- .print_header = ksym_trace_print_header,
- .print_line = ksym_trace_output
-};
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-static int ksym_profile_show(struct seq_file *m, void *v)
-{
- struct hlist_node *node;
- struct trace_ksym *entry;
- int access_type = 0;
- char fn_name[KSYM_NAME_LEN];
-
- seq_puts(m, " Access Type ");
- seq_puts(m, " Symbol Counter\n");
- seq_puts(m, " ----------- ");
- seq_puts(m, " ------ -------\n");
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
-
- access_type = entry->attr.bp_type;
-
- switch (access_type) {
- case HW_BREAKPOINT_R:
- seq_puts(m, " R ");
- break;
- case HW_BREAKPOINT_W:
- seq_puts(m, " W ");
- break;
- case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
- seq_puts(m, " RW ");
- break;
- default:
- seq_puts(m, " NA ");
- }
-
- if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
- seq_printf(m, " %-36s", fn_name);
- else
- seq_printf(m, " %-36s", "<NA>");
- seq_printf(m, " %15llu\n",
- (unsigned long long)atomic64_read(&entry->counter));
- }
- rcu_read_unlock();
-
- return 0;
-}
-
-static int ksym_profile_open(struct inode *node, struct file *file)
-{
- return single_open(file, ksym_profile_show, NULL);
-}
-
-static const struct file_operations ksym_profile_fops = {
- .open = ksym_profile_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-#endif /* CONFIG_PROFILE_KSYM_TRACER */
-
-__init static int init_ksym_trace(void)
-{
- struct dentry *d_tracer;
-
- d_tracer = tracing_init_dentry();
-
- trace_create_file("ksym_trace_filter", 0644, d_tracer,
- NULL, &ksym_tracing_fops);
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
- trace_create_file("ksym_profile", 0444, d_tracer,
- NULL, &ksym_profile_fops);
-#endif
-
- return register_tracer(&ksym_tracer);
-}
-device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ab13d7008061..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
DECLARE_RWSEM(trace_event_mutex);
-DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
-EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
-
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -742,6 +739,9 @@ int register_ftrace_event(struct trace_event *event)
if (WARN_ON(!event))
goto out;
+ if (WARN_ON(!event->funcs))
+ goto out;
+
INIT_LIST_HEAD(&event->list);
if (!event->type) {
@@ -774,14 +774,14 @@ int register_ftrace_event(struct trace_event *event)
goto out;
}
- if (event->trace == NULL)
- event->trace = trace_nop_print;
- if (event->raw == NULL)
- event->raw = trace_nop_print;
- if (event->hex == NULL)
- event->hex = trace_nop_print;
- if (event->binary == NULL)
- event->binary = trace_nop_print;
+ if (event->funcs->trace == NULL)
+ event->funcs->trace = trace_nop_print;
+ if (event->funcs->raw == NULL)
+ event->funcs->raw = trace_nop_print;
+ if (event->funcs->hex == NULL)
+ event->funcs->hex = trace_nop_print;
+ if (event->funcs->binary == NULL)
+ event->funcs->binary = trace_nop_print;
key = event->type & (EVENT_HASHSIZE - 1);
@@ -823,13 +823,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
* Standard events
*/
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return TRACE_TYPE_HANDLED;
}
/* TRACE_FN */
-static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -856,7 +858,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
@@ -870,7 +873,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -883,7 +887,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -896,14 +901,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_fn_event = {
- .type = TRACE_FN,
+static struct trace_event_functions trace_fn_funcs = {
.trace = trace_fn_trace,
.raw = trace_fn_raw,
.hex = trace_fn_hex,
.binary = trace_fn_bin,
};
+static struct trace_event trace_fn_event = {
+ .type = TRACE_FN,
+ .funcs = &trace_fn_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -932,13 +941,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_print(iter, "==>");
}
static enum print_line_t trace_wake_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return trace_ctxwake_print(iter, " +");
}
@@ -966,12 +976,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, 0);
}
-static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, '+');
}
@@ -1000,18 +1012,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, 0);
}
-static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, '+');
}
static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct ctx_switch_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1028,81 +1042,34 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_ctx_event = {
- .type = TRACE_CTX,
+static struct trace_event_functions trace_ctx_funcs = {
.trace = trace_ctx_print,
.raw = trace_ctx_raw,
.hex = trace_ctx_hex,
.binary = trace_ctxwake_bin,
};
-static struct trace_event trace_wake_event = {
- .type = TRACE_WAKE,
+static struct trace_event trace_ctx_event = {
+ .type = TRACE_CTX,
+ .funcs = &trace_ctx_funcs,
+};
+
+static struct trace_event_functions trace_wake_funcs = {
.trace = trace_wake_print,
.raw = trace_wake_raw,
.hex = trace_wake_hex,
.binary = trace_ctxwake_bin,
};
-/* TRACE_SPECIAL */
-static enum print_line_t trace_special_print(struct trace_iterator *iter,
- int flags)
-{
- struct special_entry *field;
-
- trace_assign_type(field, iter->ent);
-
- if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_hex(struct trace_iterator *iter,
- int flags)
-{
- struct special_entry *field;
- struct trace_seq *s = &iter->seq;
-
- trace_assign_type(field, iter->ent);
-
- SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_bin(struct trace_iterator *iter,
- int flags)
-{
- struct special_entry *field;
- struct trace_seq *s = &iter->seq;
-
- trace_assign_type(field, iter->ent);
-
- SEQ_PUT_FIELD_RET(s, field->arg1);
- SEQ_PUT_FIELD_RET(s, field->arg2);
- SEQ_PUT_FIELD_RET(s, field->arg3);
-
- return TRACE_TYPE_HANDLED;
-}
-
-static struct trace_event trace_special_event = {
- .type = TRACE_SPECIAL,
- .trace = trace_special_print,
- .raw = trace_special_print,
- .hex = trace_special_hex,
- .binary = trace_special_bin,
+static struct trace_event trace_wake_event = {
+ .type = TRACE_WAKE,
+ .funcs = &trace_wake_funcs,
};
/* TRACE_STACK */
static enum print_line_t trace_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct stack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1130,17 +1097,18 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_stack_funcs = {
+ .trace = trace_stack_print,
+};
+
static struct trace_event trace_stack_event = {
.type = TRACE_STACK,
- .trace = trace_stack_print,
- .raw = trace_special_print,
- .hex = trace_special_hex,
- .binary = trace_special_bin,
+ .funcs = &trace_stack_funcs,
};
/* TRACE_USER_STACK */
static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct userstack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1159,17 +1127,19 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_user_stack_funcs = {
+ .trace = trace_user_stack_print,
+};
+
static struct trace_event trace_user_stack_event = {
.type = TRACE_USER_STACK,
- .trace = trace_user_stack_print,
- .raw = trace_special_print,
- .hex = trace_special_hex,
- .binary = trace_special_bin,
+ .funcs = &trace_user_stack_funcs,
};
/* TRACE_BPRINT */
static enum print_line_t
-trace_bprint_print(struct trace_iterator *iter, int flags)
+trace_bprint_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
@@ -1194,7 +1164,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
static enum print_line_t
-trace_bprint_raw(struct trace_iterator *iter, int flags)
+trace_bprint_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct bprint_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1213,16 +1184,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_bprint_funcs = {
+ .trace = trace_bprint_print,
+ .raw = trace_bprint_raw,
+};
static struct trace_event trace_bprint_event = {
.type = TRACE_BPRINT,
- .trace = trace_bprint_print,
- .raw = trace_bprint_raw,
+ .funcs = &trace_bprint_funcs,
};
/* TRACE_PRINT */
static enum print_line_t trace_print_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1241,7 +1215,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct print_entry *field;
@@ -1256,18 +1231,21 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_print_event = {
- .type = TRACE_PRINT,
+static struct trace_event_functions trace_print_funcs = {
.trace = trace_print_print,
.raw = trace_print_raw,
};
+static struct trace_event trace_print_event = {
+ .type = TRACE_PRINT,
+ .funcs = &trace_print_funcs,
+};
+
static struct trace_event *events[] __initdata = {
&trace_fn_event,
&trace_ctx_event,
&trace_wake_event,
- &trace_special_event,
&trace_stack_event,
&trace_user_stack_event,
&trace_bprint_event,
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
extern struct trace_event *ftrace_find_event(int type);
extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
- int flags);
+ int flags, struct trace_event *event);
extern int
trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a55fccfede5d..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,7 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
}
static void
-probe_sched_switch(struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -108,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
}
static void
-probe_sched_wakeup(struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -138,21 +138,21 @@ static int tracing_sched_register(void)
{
int ret;
- ret = register_trace_sched_wakeup(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return ret;
}
- ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_sched_switch);
+ ret = register_trace_sched_switch(probe_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -161,17 +161,17 @@ static int tracing_sched_register(void)
return ret;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
return ret;
}
static void tracing_sched_unregister(void)
{
- unregister_trace_sched_switch(probe_sched_switch);
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_switch(probe_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
static void tracing_start_sched_switch(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 8052446ceeaa..4086eae6e81b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
- int resched;
int cpu;
int pc;
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
return;
pc = preempt_count();
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
out:
atomic_dec(&data->disabled);
out_enable:
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __read_mostly =
@@ -98,7 +97,8 @@ static int report_latency(cycle_t delta)
return 1;
}
-static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
+static void
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
{
if (task != wakeup_task)
return;
@@ -107,7 +107,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
}
static void notrace
-probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
+probe_wakeup_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
cycle_t T0, T1, delta;
@@ -199,7 +200,7 @@ static void wakeup_reset(struct trace_array *tr)
}
static void
-probe_wakeup(struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p, int success)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
@@ -263,28 +264,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
- ret = register_trace_sched_wakeup(probe_wakeup);
+ ret = register_trace_sched_wakeup(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return;
}
- ret = register_trace_sched_wakeup_new(probe_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_wakeup_sched_switch);
+ ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
- ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
+ ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
@@ -311,19 +312,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
return;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_wakeup);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_wakeup);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
}
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
- unregister_trace_sched_switch(probe_wakeup_sched_switch);
- unregister_trace_sched_wakeup_new(probe_wakeup);
- unregister_trace_sched_wakeup(probe_wakeup);
- unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
+ unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
}
static int __wakeup_tracer_init(struct trace_array *tr)
@@ -381,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
+ .use_max_tr = 1,
};
static struct tracer wakeup_rt_tracer __read_mostly =
@@ -395,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
+ .use_max_tr = 1,
};
__init static int init_wakeup_tracer(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_WAKE:
case TRACE_STACK:
case TRACE_PRINT:
- case TRACE_SPECIAL:
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET:
- case TRACE_KSYM:
return 1;
}
return 0;
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
}
#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-#ifdef CONFIG_SYSPROF_TRACER
-int
-trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
-{
- unsigned long count;
- int ret;
-
- /* start the tracing */
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- /* Sleep for a 1/10 of a second */
- msleep(100);
- /* stop the tracing. */
- tracing_stop();
- /* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
- trace->reset(tr);
- tracing_start();
-
- if (!ret && !count) {
- printk(KERN_CONT ".. no entries found ..");
- ret = -1;
- }
-
- return ret;
-}
-#endif /* CONFIG_SYSPROF_TRACER */
-
#ifdef CONFIG_BRANCH_TRACER
int
trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_BRANCH_TRACER */
-#ifdef CONFIG_KSYM_TRACER
-static int ksym_selftest_dummy;
-
-int
-trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
-{
- unsigned long count;
- int ret;
-
- /* start the tracing */
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- ksym_selftest_dummy = 0;
- /* Register the read-write tracing request */
-
- ret = process_new_ksym_entry("ksym_selftest_dummy",
- HW_BREAKPOINT_R | HW_BREAKPOINT_W,
- (unsigned long)(&ksym_selftest_dummy));
-
- if (ret < 0) {
- printk(KERN_CONT "ksym_trace read-write startup test failed\n");
- goto ret_path;
- }
- /* Perform a read and a write operation over the dummy variable to
- * trigger the tracer
- */
- if (ksym_selftest_dummy == 0)
- ksym_selftest_dummy++;
-
- /* stop the tracing. */
- tracing_stop();
- /* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
- trace->reset(tr);
- tracing_start();
-
- /* read & write operations - one each is performed on the dummy variable
- * triggering two entries in the trace buffer
- */
- if (!ret && count != 2) {
- printk(KERN_CONT "Ksym tracer startup test failed");
- ret = -1;
- }
-
-ret_path:
- return ret;
-}
-#endif /* CONFIG_KSYM_TRACER */
-
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..056468eae7cf 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip)
{
- int cpu, resched;
+ int cpu;
if (unlikely(!ftrace_enabled || stack_trace_disabled))
return;
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
/* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
out:
per_cpu(trace_active, cpu)--;
/* prevent recursion in schedule */
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,55 @@ static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+
+static int syscall_enter_define_fields(struct ftrace_event_call *call);
+static int syscall_exit_define_fields(struct ftrace_event_call *call);
+
+/* All syscall exit events have the same fields */
+static LIST_HEAD(syscall_exit_fields);
+
+static struct list_head *
+syscall_get_enter_fields(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ return &entry->enter_fields;
+}
+
+static struct list_head *
+syscall_get_exit_fields(struct ftrace_event_call *call)
+{
+ return &syscall_exit_fields;
+}
+
+struct trace_event_functions enter_syscall_print_funcs = {
+ .trace = print_syscall_enter,
+};
+
+struct trace_event_functions exit_syscall_print_funcs = {
+ .trace = print_syscall_exit,
+};
+
+struct ftrace_event_class event_class_syscall_enter = {
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
+};
+
+struct ftrace_event_class event_class_syscall_exit = {
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .get_fields = syscall_get_exit_fields,
+ .raw_init = init_syscall_trace,
+};
+
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];
@@ -53,7 +102,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
}
enum print_line_t
-print_syscall_enter(struct trace_iterator *iter, int flags)
+print_syscall_enter(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -68,7 +118,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
if (!entry)
goto end;
- if (entry->enter_event->id != ent->type) {
+ if (entry->enter_event->event.type != ent->type) {
WARN_ON_ONCE(1);
goto end;
}
@@ -105,7 +155,8 @@ end:
}
enum print_line_t
-print_syscall_exit(struct trace_iterator *iter, int flags)
+print_syscall_exit(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -123,7 +174,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
- if (entry->exit_event->id != ent->type) {
+ if (entry->exit_event->event.type != ent->type) {
WARN_ON_ONCE(1);
return TRACE_TYPE_UNHANDLED;
}
@@ -205,7 +256,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
-int syscall_enter_define_fields(struct ftrace_event_call *call)
+static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -228,7 +279,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
-int syscall_exit_define_fields(struct ftrace_event_call *call)
+static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -243,7 +294,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
-void ftrace_syscall_enter(struct pt_regs *regs, long id)
+void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
@@ -265,7 +316,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->enter_event->id, size, 0, 0);
+ sys_data->enter_event->event.type, size, 0, 0);
if (!event)
return;
@@ -278,7 +329,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-void ftrace_syscall_exit(struct pt_regs *regs, long ret)
+void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
@@ -297,7 +348,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
return;
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->exit_event->id, sizeof(*entry), 0, 0);
+ sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
if (!event)
return;
@@ -320,7 +371,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_enter)
- ret = register_trace_sys_enter(ftrace_syscall_enter);
+ ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
if (!ret) {
set_bit(num, enabled_enter_syscalls);
sys_refcount_enter++;
@@ -340,7 +391,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
sys_refcount_enter--;
clear_bit(num, enabled_enter_syscalls);
if (!sys_refcount_enter)
- unregister_trace_sys_enter(ftrace_syscall_enter);
+ unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
mutex_unlock(&syscall_trace_lock);
}
@@ -354,7 +405,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_exit)
- ret = register_trace_sys_exit(ftrace_syscall_exit);
+ ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
if (!ret) {
set_bit(num, enabled_exit_syscalls);
sys_refcount_exit++;
@@ -374,7 +425,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
sys_refcount_exit--;
clear_bit(num, enabled_exit_syscalls);
if (!sys_refcount_exit)
- unregister_trace_sys_exit(ftrace_syscall_exit);
+ unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
mutex_unlock(&syscall_trace_lock);
}
@@ -434,11 +485,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static void perf_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
- unsigned long flags;
+ struct hlist_head *head;
int syscall_nr;
int rctx;
int size;
@@ -461,14 +512,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
return;
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->id, &rctx, &flags);
+ sys_data->enter_event->event.type, regs, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}
int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -480,7 +533,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
mutex_lock(&syscall_trace_lock);
if (!sys_perf_refcount_enter)
- ret = register_trace_sys_enter(perf_syscall_enter);
+ ret = register_trace_sys_enter(perf_syscall_enter, NULL);
if (ret) {
pr_info("event trace: Could not activate"
"syscall entry trace point");
@@ -502,15 +555,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
sys_perf_refcount_enter--;
clear_bit(num, enabled_perf_enter_syscalls);
if (!sys_perf_refcount_enter)
- unregister_trace_sys_enter(perf_syscall_enter);
+ unregister_trace_sys_enter(perf_syscall_enter, NULL);
mutex_unlock(&syscall_trace_lock);
}
-static void perf_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
- unsigned long flags;
+ struct hlist_head *head;
int syscall_nr;
int rctx;
int size;
@@ -536,14 +589,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
return;
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
- sys_data->exit_event->id, &rctx, &flags);
+ sys_data->exit_event->event.type, regs, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}
int perf_sysexit_enable(struct ftrace_event_call *call)
@@ -555,7 +609,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
mutex_lock(&syscall_trace_lock);
if (!sys_perf_refcount_exit)
- ret = register_trace_sys_exit(perf_syscall_exit);
+ ret = register_trace_sys_exit(perf_syscall_exit, NULL);
if (ret) {
pr_info("event trace: Could not activate"
"syscall exit trace point");
@@ -577,9 +631,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
sys_perf_refcount_exit--;
clear_bit(num, enabled_perf_exit_syscalls);
if (!sys_perf_refcount_exit)
- unregister_trace_sys_exit(perf_syscall_exit);
+ unregister_trace_sys_exit(perf_syscall_exit, NULL);
mutex_unlock(&syscall_trace_lock);
}
#endif /* CONFIG_PERF_EVENTS */
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_enter(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_enter(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysenter_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysenter_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_exit(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_exit(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysexit_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysexit_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- * trace stack traces
- *
- * Copyright (C) 2004-2008, Soeren Sandmann
- * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
- * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/hrtimer.h>
-#include <linux/uaccess.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/irq.h>
-#include <linux/fs.h>
-
-#include <asm/stacktrace.h>
-
-#include "trace.h"
-
-static struct trace_array *sysprof_trace;
-static int __read_mostly tracer_enabled;
-
-/*
- * 1 msec sample interval by default:
- */
-static unsigned long sample_period = 1000000;
-static const unsigned int sample_max_depth = 512;
-
-static DEFINE_MUTEX(sample_timer_lock);
-/*
- * Per CPU hrtimers that do the profiling:
- */
-static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
-
-struct stack_frame {
- const void __user *next_fp;
- unsigned long return_address;
-};
-
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
-{
- int ret;
-
- if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
- return 0;
-
- ret = 1;
- pagefault_disable();
- if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
- ret = 0;
- pagefault_enable();
-
- return ret;
-}
-
-struct backtrace_info {
- struct trace_array_cpu *data;
- struct trace_array *tr;
- int pos;
-};
-
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
-
-static int backtrace_stack(void *data, char *name)
-{
- /* Don't bother with IRQ stacks for now */
- return -1;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
- struct backtrace_info *info = data;
-
- if (info->pos < sample_max_depth && reliable) {
- __trace_special(info->tr, info->data, 1, addr, 0);
-
- info->pos++;
- }
-}
-
-static const struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
- .stack = backtrace_stack,
- .address = backtrace_address,
- .walk_stack = print_context_stack,
-};
-
-static int
-trace_kernel(struct pt_regs *regs, struct trace_array *tr,
- struct trace_array_cpu *data)
-{
- struct backtrace_info info;
- unsigned long bp;
- char *stack;
-
- info.tr = tr;
- info.data = data;
- info.pos = 1;
-
- __trace_special(info.tr, info.data, 1, regs->ip, 0);
-
- stack = ((char *)regs + sizeof(struct pt_regs));
-#ifdef CONFIG_FRAME_POINTER
- bp = regs->bp;
-#else
- bp = 0;
-#endif
-
- dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
-
- return info.pos;
-}
-
-static void timer_notify(struct pt_regs *regs, int cpu)
-{
- struct trace_array_cpu *data;
- struct stack_frame frame;
- struct trace_array *tr;
- const void __user *fp;
- int is_user;
- int i;
-
- if (!regs)
- return;
-
- tr = sysprof_trace;
- data = tr->data[cpu];
- is_user = user_mode(regs);
-
- if (!current || current->pid == 0)
- return;
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- __trace_special(tr, data, 0, 0, current->pid);
-
- if (!is_user)
- i = trace_kernel(regs, tr, data);
- else
- i = 0;
-
- /*
- * Trace user stack if we are not a kernel thread
- */
- if (current->mm && i < sample_max_depth) {
- regs = (struct pt_regs *)current->thread.sp0 - 1;
-
- fp = (void __user *)regs->bp;
-
- __trace_special(tr, data, 2, regs->ip, 0);
-
- while (i < sample_max_depth) {
- frame.next_fp = NULL;
- frame.return_address = 0;
- if (!copy_stack_frame(fp, &frame))
- break;
- if ((unsigned long)fp < regs->sp)
- break;
-
- __trace_special(tr, data, 2, frame.return_address,
- (unsigned long)fp);
- fp = frame.next_fp;
-
- i++;
- }
-
- }
-
- /*
- * Special trace entry if we overflow the max depth:
- */
- if (i == sample_max_depth)
- __trace_special(tr, data, -1, -1, -1);
-
- __trace_special(tr, data, 3, current->pid, i);
-}
-
-static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
-{
- /* trace here */
- timer_notify(get_irq_regs(), smp_processor_id());
-
- hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
-
- return HRTIMER_RESTART;
-}
-
-static void start_stack_timer(void *unused)
-{
- struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
-
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = stack_trace_timer_fn;
-
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
-}
-
-static void start_stack_timers(void)
-{
- on_each_cpu(start_stack_timer, NULL, 1);
-}
-
-static void stop_stack_timer(int cpu)
-{
- struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
-
- hrtimer_cancel(hrtimer);
-}
-
-static void stop_stack_timers(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- stop_stack_timer(cpu);
-}
-
-static void stop_stack_trace(struct trace_array *tr)
-{
- mutex_lock(&sample_timer_lock);
- stop_stack_timers();
- tracer_enabled = 0;
- mutex_unlock(&sample_timer_lock);
-}
-
-static int stack_trace_init(struct trace_array *tr)
-{
- sysprof_trace = tr;
-
- tracing_start_cmdline_record();
-
- mutex_lock(&sample_timer_lock);
- start_stack_timers();
- tracer_enabled = 1;
- mutex_unlock(&sample_timer_lock);
- return 0;
-}
-
-static void stack_trace_reset(struct trace_array *tr)
-{
- tracing_stop_cmdline_record();
- stop_stack_trace(tr);
-}
-
-static struct tracer stack_trace __read_mostly =
-{
- .name = "sysprof",
- .init = stack_trace_init,
- .reset = stack_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_sysprof,
-#endif
-};
-
-__init static int init_stack_trace(void)
-{
- return register_tracer(&stack_trace);
-}
-device_initcall(init_stack_trace);
-
-#define MAX_LONG_DIGITS 22
-
-static ssize_t
-sysprof_sample_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_LONG_DIGITS];
- int r;
-
- r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-sysprof_sample_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_LONG_DIGITS];
- unsigned long val;
-
- if (cnt > MAX_LONG_DIGITS-1)
- cnt = MAX_LONG_DIGITS-1;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- val = simple_strtoul(buf, NULL, 10);
- /*
- * Enforce a minimum sample period of 100 usecs:
- */
- if (val < 100)
- val = 100;
-
- mutex_lock(&sample_timer_lock);
- stop_stack_timers();
- sample_period = val * 1000;
- start_stack_timers();
- mutex_unlock(&sample_timer_lock);
-
- return cnt;
-}
-
-static const struct file_operations sysprof_sample_fops = {
- .read = sysprof_sample_read,
- .write = sysprof_sample_write,
-};
-
-void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
-{
-
- trace_create_file("sysprof_sample_period", 0644,
- d_tracer, NULL, &sysprof_sample_fops);
-}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9e..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
/* Insertion of a work */
static void
-probe_workqueue_insertion(struct task_struct *wq_thread,
+probe_workqueue_insertion(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
/* Execution of a work */
static void
-probe_workqueue_execution(struct task_struct *wq_thread,
+probe_workqueue_execution(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
}
/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+static void probe_workqueue_creation(void *ignore,
+ struct task_struct *wq_thread, int cpu)
{
struct cpu_workqueue_stats *cws;
unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
}
/* Destruction of a cpu workqueue thread */
-static void probe_workqueue_destruction(struct task_struct *wq_thread)
+static void
+probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
{
/* Workqueue only execute on one cpu */
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
{
int ret, cpu;
- ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+ ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
if (ret)
goto out;
- ret = register_trace_workqueue_execution(probe_workqueue_execution);
+ ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
if (ret)
goto no_insertion;
- ret = register_trace_workqueue_creation(probe_workqueue_creation);
+ ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
if (ret)
goto no_execution;
- ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+ ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
if (ret)
goto no_creation;
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
return 0;
no_creation:
- unregister_trace_workqueue_creation(probe_workqueue_creation);
+ unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
no_execution:
- unregister_trace_workqueue_execution(probe_workqueue_execution);
+ unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
no_insertion:
- unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+ unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
out:
pr_warning("trace_workqueue: unable to trace workqueues\n");
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
*/
struct tracepoint_entry {
struct hlist_node hlist;
- void **funcs;
+ struct tracepoint_func *funcs;
int refcount; /* Number of times armed. 0 if disarmed. */
char name[0];
};
@@ -64,12 +64,12 @@ struct tp_probes {
struct rcu_head rcu;
struct list_head list;
} u;
- void *probes[0];
+ struct tracepoint_func probes[0];
};
static inline void *allocate_probes(int count)
{
- struct tp_probes *p = kmalloc(count * sizeof(void *)
+ struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
+ sizeof(struct tp_probes), GFP_KERNEL);
return p == NULL ? NULL : p->probes;
}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
kfree(container_of(head, struct tp_probes, u.rcu));
}
-static inline void release_probes(void *old)
+static inline void release_probes(struct tracepoint_func *old)
{
if (old) {
struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
if (!tracepoint_debug || !entry->funcs)
return;
- for (i = 0; entry->funcs[i]; i++)
- printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
+ for (i = 0; entry->funcs[i].func; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
}
-static void *
-tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
+static struct tracepoint_func *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry,
+ void *probe, void *data)
{
int nr_probes = 0;
- void **old, **new;
+ struct tracepoint_func *old, *new;
WARN_ON(!probe);
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
old = entry->funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes]; nr_probes++)
- if (old[nr_probes] == probe)
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == probe &&
+ old[nr_probes].data == data)
return ERR_PTR(-EEXIST);
}
/* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old)
- memcpy(new, old, nr_probes * sizeof(void *));
- new[nr_probes] = probe;
- new[nr_probes + 1] = NULL;
+ memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+ new[nr_probes].func = probe;
+ new[nr_probes].data = data;
+ new[nr_probes + 1].func = NULL;
entry->refcount = nr_probes + 1;
entry->funcs = new;
debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
}
static void *
-tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
+ void *probe, void *data)
{
int nr_probes = 0, nr_del = 0, i;
- void **old, **new;
+ struct tracepoint_func *old, *new;
old = entry->funcs;
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
debug_print_probes(entry);
/* (N -> M), (N > 1, M >= 0) probes */
- for (nr_probes = 0; old[nr_probes]; nr_probes++) {
- if ((!probe || old[nr_probes] == probe))
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if (!probe ||
+ (old[nr_probes].func == probe &&
+ old[nr_probes].data == data))
nr_del++;
}
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
new = allocate_probes(nr_probes - nr_del + 1);
if (new == NULL)
return ERR_PTR(-ENOMEM);
- for (i = 0; old[i]; i++)
- if ((probe && old[i] != probe))
+ for (i = 0; old[i].func; i++)
+ if (probe &&
+ (old[i].func != probe || old[i].data != data))
new[j++] = old[i];
- new[nr_probes - nr_del] = NULL;
+ new[nr_probes - nr_del].func = NULL;
entry->refcount = nr_probes - nr_del;
entry->funcs = new;
}
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
module_update_tracepoints();
}
-static void *tracepoint_add_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_add_probe(const char *name, void *probe, void *data)
{
struct tracepoint_entry *entry;
- void *old;
+ struct tracepoint_func *old;
entry = get_tracepoint(name);
if (!entry) {
entry = add_tracepoint(name);
if (IS_ERR(entry))
- return entry;
+ return (struct tracepoint_func *)entry;
}
- old = tracepoint_entry_add_probe(entry, probe);
+ old = tracepoint_entry_add_probe(entry, probe, data);
if (IS_ERR(old) && !entry->refcount)
remove_tracepoint(entry);
return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
* Returns 0 if ok, error value on error.
* The probe address must at least be aligned on the architecture pointer size.
*/
-int tracepoint_probe_register(const char *name, void *probe)
+int tracepoint_probe_register(const char *name, void *probe, void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe);
+ old = tracepoint_add_probe(name, probe, data);
mutex_unlock(&tracepoints_mutex);
if (IS_ERR(old))
return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
-static void *tracepoint_remove_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_remove_probe(const char *name, void *probe, void *data)
{
struct tracepoint_entry *entry;
- void *old;
+ struct tracepoint_func *old;
entry = get_tracepoint(name);
if (!entry)
return ERR_PTR(-ENOENT);
- old = tracepoint_entry_remove_probe(entry, probe);
+ old = tracepoint_entry_remove_probe(entry, probe, data);
if (IS_ERR(old))
return old;
if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
* itself uses stop_machine(), which insures that every preempt disabled section
* have finished.
*/
-int tracepoint_probe_unregister(const char *name, void *probe)
+int tracepoint_probe_unregister(const char *name, void *probe, void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe);
+ old = tracepoint_remove_probe(name, probe, data);
mutex_unlock(&tracepoints_mutex);
if (IS_ERR(old))
return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
*
* caller must call tracepoint_probe_update_all()
*/
-int tracepoint_probe_register_noupdate(const char *name, void *probe)
+int tracepoint_probe_register_noupdate(const char *name, void *probe,
+ void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe);
+ old = tracepoint_add_probe(name, probe, data);
if (IS_ERR(old)) {
mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
*
* caller must call tracepoint_probe_update_all()
*/
-int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
+ void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe);
+ old = tracepoint_remove_probe(name, probe, data);
if (IS_ERR(old)) {
mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/highuid.h>
#include <linux/cred.h>
/*
@@ -54,8 +55,8 @@ int create_user_ns(struct cred *new)
#endif
/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
- /* alloc_uid() incremented the userns refcount. Just set it to 1 */
- kref_set(&ns->kref, 1);
+ /* root_user holds a reference to ns, our reference can be dropped */
+ put_user_ns(ns);
return 0;
}
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
schedule_work(&ns->destroyer);
}
EXPORT_SYMBOL(free_user_ns);
+
+uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
+{
+ struct user_namespace *tmp;
+
+ if (likely(to == cred->user->user_ns))
+ return uid;
+
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of it's parents?
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (uid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping */
+ return overflowuid;
+}
+
+gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
+{
+ struct user_namespace *tmp;
+
+ if (likely(to == cred->user->user_ns))
+ return gid;
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of it's parents?
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (gid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping */
+ return overflowgid;
+}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..613bc1f04610
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,567 @@
+/*
+ * Detect hard and soft lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * this code detects hard lockups: incidents in where on a CPU
+ * the kernel does not respond to anything except NMI.
+ *
+ * Note: Most of this code is borrowed heavily from softlockup.c,
+ * so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
+ * to those contributors as well.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+int watchdog_enabled;
+int __read_mostly softlockup_thresh = 60;
+
+static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
+static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(bool, softlockup_touch_sync);
+static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+#endif
+
+static int __read_mostly did_panic;
+static int __initdata no_watchdog;
+
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static int hardlockup_panic;
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+#endif
+
+unsigned int __read_mostly softlockup_panic =
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+
+static int __init softlockup_panic_setup(char *str)
+{
+ softlockup_panic = simple_strtoul(str, NULL, 0);
+
+ return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
+static int __init nowatchdog_setup(char *str)
+{
+ no_watchdog = 1;
+ return 1;
+}
+__setup("nowatchdog", nowatchdog_setup);
+
+/* deprecated */
+static int __init nosoftlockup_setup(char *str)
+{
+ no_watchdog = 1;
+ return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+/* */
+
+
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(int this_cpu)
+{
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+}
+
+static unsigned long get_sample_period(void)
+{
+ /*
+ * convert softlockup_thresh from seconds to ns
+ * the divide by 5 is to give hrtimer 5 chances to
+ * increment before the hardlockup detector generates
+ * a warning
+ */
+ return softlockup_thresh / 5 * NSEC_PER_SEC;
+}
+
+/* Commands for resetting the watchdog */
+static void __touch_watchdog(void)
+{
+ int this_cpu = smp_processor_id();
+
+ __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+}
+
+void touch_softlockup_watchdog(void)
+{
+ __get_cpu_var(watchdog_touch_ts) = 0;
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+void touch_all_softlockup_watchdogs(void)
+{
+ int cpu;
+
+ /*
+ * this is done lockless
+ * do we care if a 0 races with a timestamp?
+ * all it means is the softlock check starts one cycle later
+ */
+ for_each_online_cpu(cpu)
+ per_cpu(watchdog_touch_ts, cpu) = 0;
+}
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void touch_nmi_watchdog(void)
+{
+ __get_cpu_var(watchdog_nmi_touch) = true;
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+#endif
+
+void touch_softlockup_watchdog_sync(void)
+{
+ __raw_get_cpu_var(softlockup_touch_sync) = true;
+ __raw_get_cpu_var(watchdog_touch_ts) = 0;
+}
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+/* watchdog detector functions */
+static int is_hardlockup(void)
+{
+ unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
+
+ if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
+ return 1;
+
+ __get_cpu_var(hrtimer_interrupts_saved) = hrint;
+ return 0;
+}
+#endif
+
+static int is_softlockup(unsigned long touch_ts)
+{
+ unsigned long now = get_timestamp(smp_processor_id());
+
+ /* Warn about unreasonable delays: */
+ if (time_after(now, touch_ts + softlockup_thresh))
+ return now - touch_ts;
+
+ return 0;
+}
+
+static int
+watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = watchdog_panic,
+};
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static struct perf_event_attr wd_hw_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 1,
+};
+
+/* Callback function for perf event subsystem */
+void watchdog_overflow_callback(struct perf_event *event, int nmi,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (__get_cpu_var(watchdog_nmi_touch) == true) {
+ __get_cpu_var(watchdog_nmi_touch) = false;
+ return;
+ }
+
+ /* check for a hardlockup
+ * This is done by making sure our timer interrupt
+ * is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup()) {
+ int this_cpu = smp_processor_id();
+
+ /* only print hardlockups once */
+ if (__get_cpu_var(hard_watchdog_warn) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+
+ __get_cpu_var(hard_watchdog_warn) = true;
+ return;
+ }
+
+ __get_cpu_var(hard_watchdog_warn) = false;
+ return;
+}
+static void watchdog_interrupt_count(void)
+{
+ __get_cpu_var(hrtimer_interrupts)++;
+}
+#else
+static inline void watchdog_interrupt_count(void) { return; }
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+/* watchdog kicker functions */
+static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
+{
+ unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
+ struct pt_regs *regs = get_irq_regs();
+ int duration;
+
+ /* kick the hardlockup detector */
+ watchdog_interrupt_count();
+
+ /* kick the softlockup detector */
+ wake_up_process(__get_cpu_var(softlockup_watchdog));
+
+ /* .. and repeat */
+ hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
+
+ if (touch_ts == 0) {
+ if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
+ /*
+ * If the time stamp was touched atomically
+ * make sure the scheduler tick is up to date.
+ */
+ __get_cpu_var(softlockup_touch_sync) = false;
+ sched_clock_tick();
+ }
+ __touch_watchdog();
+ return HRTIMER_RESTART;
+ }
+
+ /* check for a softlockup
+ * This is done by making sure a high priority task is
+ * being scheduled. The task touches the watchdog to
+ * indicate it is getting cpu time. If it hasn't then
+ * this is a good indication some task is hogging the cpu
+ */
+ duration = is_softlockup(touch_ts);
+ if (unlikely(duration)) {
+ /* only warn once */
+ if (__get_cpu_var(soft_watchdog_warn) == true)
+ return HRTIMER_RESTART;
+
+ printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ smp_processor_id(), duration,
+ current->comm, task_pid_nr(current));
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ if (softlockup_panic)
+ panic("softlockup: hung tasks");
+ __get_cpu_var(soft_watchdog_warn) = true;
+ } else
+ __get_cpu_var(soft_watchdog_warn) = false;
+
+ return HRTIMER_RESTART;
+}
+
+
+/*
+ * The watchdog thread - touches the timestamp.
+ */
+static int watchdog(void *unused)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+
+ /* initialize timestamp */
+ __touch_watchdog();
+
+ /* kick off the timer for the hardlockup detector */
+ /* done here because hrtimer_start can only pin to smp_processor_id() */
+ hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
+ HRTIMER_MODE_REL_PINNED);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * Run briefly once per second to reset the softlockup timestamp.
+ * If this gets delayed for more than 60 seconds then the
+ * debug-printout triggers in watchdog_timer_fn().
+ */
+ while (!kthread_should_stop()) {
+ __touch_watchdog();
+ schedule();
+
+ if (kthread_should_stop())
+ break;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static int watchdog_nmi_enable(int cpu)
+{
+ struct perf_event_attr *wd_attr;
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /* is it already setup and enabled? */
+ if (event && event->state > PERF_EVENT_STATE_OFF)
+ goto out;
+
+ /* it is setup but not enabled */
+ if (event != NULL)
+ goto out_enable;
+
+ /* Try to register using hardware perf events */
+ wd_attr = &wd_hw_attr;
+ wd_attr->sample_period = hw_nmi_get_sample_period();
+ event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+ if (!IS_ERR(event)) {
+ printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
+ goto out_save;
+ }
+
+ printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+ return -1;
+
+ /* success path */
+out_save:
+ per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+ perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+ return 0;
+}
+
+static void watchdog_nmi_disable(int cpu)
+{
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event) {
+ perf_event_disable(event);
+ per_cpu(watchdog_ev, cpu) = NULL;
+
+ /* should be in cleanup, but blocks oprofile */
+ perf_event_release_kernel(event);
+ }
+ return;
+}
+#else
+static int watchdog_nmi_enable(int cpu) { return 0; }
+static void watchdog_nmi_disable(int cpu) { return; }
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+/* prepare/enable/disable routines */
+static int watchdog_prepare_cpu(int cpu)
+{
+ struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
+
+ WARN_ON(per_cpu(softlockup_watchdog, cpu));
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
+
+ return 0;
+}
+
+static int watchdog_enable(int cpu)
+{
+ struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+
+ /* enable the perf event */
+ if (watchdog_nmi_enable(cpu) != 0)
+ return -1;
+
+ /* create the watchdog thread */
+ if (!p) {
+ p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
+ if (IS_ERR(p)) {
+ printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
+ return -1;
+ }
+ kthread_bind(p, cpu);
+ per_cpu(watchdog_touch_ts, cpu) = 0;
+ per_cpu(softlockup_watchdog, cpu) = p;
+ wake_up_process(p);
+ }
+
+ return 0;
+}
+
+static void watchdog_disable(int cpu)
+{
+ struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+ struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
+
+ /*
+ * cancel the timer first to stop incrementing the stats
+ * and waking up the kthread
+ */
+ hrtimer_cancel(hrtimer);
+
+ /* disable the perf event */
+ watchdog_nmi_disable(cpu);
+
+ /* stop the watchdog thread */
+ if (p) {
+ per_cpu(softlockup_watchdog, cpu) = NULL;
+ kthread_stop(p);
+ }
+
+ /* if any cpu succeeds, watchdog is considered enabled for the system */
+ watchdog_enabled = 1;
+}
+
+static void watchdog_enable_all_cpus(void)
+{
+ int cpu;
+ int result = 0;
+
+ for_each_online_cpu(cpu)
+ result += watchdog_enable(cpu);
+
+ if (result)
+ printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
+
+}
+
+static void watchdog_disable_all_cpus(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ watchdog_disable(cpu);
+
+ /* if all watchdogs are disabled, then they are disabled for the system */
+ watchdog_enabled = 0;
+}
+
+
+/* sysctl functions */
+#ifdef CONFIG_SYSCTL
+/*
+ * proc handler for /proc/sys/kernel/nmi_watchdog
+ */
+
+int proc_dowatchdog_enabled(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+ return 0;
+}
+
+int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif /* CONFIG_SYSCTL */
+
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __cpuinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int hotcpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (watchdog_prepare_cpu(hotcpu))
+ return NOTIFY_BAD;
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ if (watchdog_enable(hotcpu))
+ return NOTIFY_BAD;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ watchdog_disable(hotcpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ watchdog_disable(hotcpu);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+static int __init spawn_watchdog_task(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ int err;
+
+ if (no_watchdog)
+ return 0;
+
+ err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ WARN_ON(err == NOTIFY_BAD);
+
+ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+ register_cpu_notifier(&cpu_nfb);
+
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+ return 0;
+}
+early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 77dabbf64b8f..2994a0e3a61c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,41 +33,287 @@
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/workqueue.h>
+#include <linux/idr.h>
+
+#include "workqueue_sched.h"
+
+enum {
+ /* global_cwq flags */
+ GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
+ GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
+ GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
+ GCWQ_FREEZING = 1 << 3, /* freeze in progress */
+ GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
+
+ /* worker flags */
+ WORKER_STARTED = 1 << 0, /* started */
+ WORKER_DIE = 1 << 1, /* die die die */
+ WORKER_IDLE = 1 << 2, /* is idle */
+ WORKER_PREP = 1 << 3, /* preparing to run works */
+ WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
+ WORKER_REBIND = 1 << 5, /* mom is home, come back */
+ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
+ WORKER_UNBOUND = 1 << 7, /* worker is unbound */
+
+ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
+ WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+
+ /* gcwq->trustee_state */
+ TRUSTEE_START = 0, /* start */
+ TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
+ TRUSTEE_BUTCHER = 2, /* butcher workers */
+ TRUSTEE_RELEASE = 3, /* release workers */
+ TRUSTEE_DONE = 4, /* trustee is done */
+
+ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
+ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
+ BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
+
+ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
+ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
+
+ MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
+ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
+ CREATE_COOLDOWN = HZ, /* time to breath after fail */
+ TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
+
+ /*
+ * Rescue workers are used only on emergencies and shared by
+ * all cpus. Give -20.
+ */
+ RESCUER_NICE_LEVEL = -20,
+};
/*
- * The per-CPU workqueue (if single thread, we always use the first
- * possible cpu).
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Set during initialization and read-only afterwards.
+ *
+ * P: Preemption protected. Disabling preemption is enough and should
+ * only be modified and accessed from the local cpu.
+ *
+ * L: gcwq->lock protected. Access with gcwq->lock held.
+ *
+ * X: During normal operation, modification requires gcwq->lock and
+ * should be done only from local cpu. Either disabling preemption
+ * on local cpu or grabbing gcwq->lock is enough for read access.
+ * If GCWQ_DISASSOCIATED is set, it's identical to L.
+ *
+ * F: wq->flush_mutex protected.
+ *
+ * W: workqueue_lock protected.
*/
-struct cpu_workqueue_struct {
- spinlock_t lock;
+struct global_cwq;
- struct list_head worklist;
- wait_queue_head_t more_work;
- struct work_struct *current_work;
+/*
+ * The poor guys doing the actual heavy lifting. All on-duty workers
+ * are either serving the manager role, on idle list or on busy hash.
+ */
+struct worker {
+ /* on idle list while idle, on busy hash table while busy */
+ union {
+ struct list_head entry; /* L: while idle */
+ struct hlist_node hentry; /* L: while busy */
+ };
- struct workqueue_struct *wq;
- struct task_struct *thread;
-} ____cacheline_aligned;
+ struct work_struct *current_work; /* L: work being processed */
+ struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
+ struct list_head scheduled; /* L: scheduled works */
+ struct task_struct *task; /* I: worker task */
+ struct global_cwq *gcwq; /* I: the associated gcwq */
+ /* 64 bytes boundary on 64bit, 32 on 32bit */
+ unsigned long last_active; /* L: last active timestamp */
+ unsigned int flags; /* X: flags */
+ int id; /* I: worker id */
+ struct work_struct rebind_work; /* L: rebind worker to cpu */
+};
+
+/*
+ * Global per-cpu workqueue. There's one and only one for each cpu
+ * and all works are queued and processed here regardless of their
+ * target workqueues.
+ */
+struct global_cwq {
+ spinlock_t lock; /* the gcwq lock */
+ struct list_head worklist; /* L: list of pending works */
+ unsigned int cpu; /* I: the associated cpu */
+ unsigned int flags; /* L: GCWQ_* flags */
+
+ int nr_workers; /* L: total number of workers */
+ int nr_idle; /* L: currently idle ones */
+
+ /* workers are chained either in the idle_list or busy_hash */
+ struct list_head idle_list; /* X: list of idle workers */
+ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
+ /* L: hash of busy workers */
+
+ struct timer_list idle_timer; /* L: worker idle timeout */
+ struct timer_list mayday_timer; /* L: SOS timer for dworkers */
+
+ struct ida worker_ida; /* L: for worker IDs */
+
+ struct task_struct *trustee; /* L: for gcwq shutdown */
+ unsigned int trustee_state; /* L: trustee state */
+ wait_queue_head_t trustee_wait; /* trustee wait */
+ struct worker *first_idle; /* L: first idle worker */
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
+ * work_struct->data are used for flags and thus cwqs need to be
+ * aligned at two's power of the number of flag bits.
+ */
+struct cpu_workqueue_struct {
+ struct global_cwq *gcwq; /* I: the associated gcwq */
+ struct workqueue_struct *wq; /* I: the owning workqueue */
+ int work_color; /* L: current color */
+ int flush_color; /* L: flushing color */
+ int nr_in_flight[WORK_NR_COLORS];
+ /* L: nr of in_flight works */
+ int nr_active; /* L: nr of active works */
+ int max_active; /* L: max active works */
+ struct list_head delayed_works; /* L: delayed works */
+};
+
+/*
+ * Structure used to wait for workqueue flush.
+ */
+struct wq_flusher {
+ struct list_head list; /* F: list of flushers */
+ int flush_color; /* F: flush color waiting for */
+ struct completion done; /* flush completion */
+};
+
+/*
+ * All cpumasks are assumed to be always set on UP and thus can't be
+ * used to determine whether there's something to be done.
+ */
+#ifdef CONFIG_SMP
+typedef cpumask_var_t mayday_mask_t;
+#define mayday_test_and_set_cpu(cpu, mask) \
+ cpumask_test_and_set_cpu((cpu), (mask))
+#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
+#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
+#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp))
+#define free_mayday_mask(mask) free_cpumask_var((mask))
+#else
+typedef unsigned long mayday_mask_t;
+#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
+#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
+#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
+#define alloc_mayday_mask(maskp, gfp) true
+#define free_mayday_mask(mask) do { } while (0)
+#endif
/*
* The externally visible workqueue abstraction is an array of
* per-CPU workqueues:
*/
struct workqueue_struct {
- struct cpu_workqueue_struct *cpu_wq;
- struct list_head list;
- const char *name;
- int singlethread;
- int freezeable; /* Freeze threads during suspend */
- int rt;
+ unsigned int flags; /* I: WQ_* flags */
+ union {
+ struct cpu_workqueue_struct __percpu *pcpu;
+ struct cpu_workqueue_struct *single;
+ unsigned long v;
+ } cpu_wq; /* I: cwq's */
+ struct list_head list; /* W: list of all workqueues */
+
+ struct mutex flush_mutex; /* protects wq flushing */
+ int work_color; /* F: current work color */
+ int flush_color; /* F: current flush color */
+ atomic_t nr_cwqs_to_flush; /* flush in progress */
+ struct wq_flusher *first_flusher; /* F: first flusher */
+ struct list_head flusher_queue; /* F: flush waiters */
+ struct list_head flusher_overflow; /* F: flush overflow list */
+
+ mayday_mask_t mayday_mask; /* cpus requesting rescue */
+ struct worker *rescuer; /* I: rescue worker */
+
+ int saved_max_active; /* W: saved cwq max_active */
+ const char *name; /* I: workqueue name */
#ifdef CONFIG_LOCKDEP
- struct lockdep_map lockdep_map;
+ struct lockdep_map lockdep_map;
#endif
};
+struct workqueue_struct *system_wq __read_mostly;
+struct workqueue_struct *system_long_wq __read_mostly;
+struct workqueue_struct *system_nrt_wq __read_mostly;
+struct workqueue_struct *system_unbound_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_wq);
+EXPORT_SYMBOL_GPL(system_long_wq);
+EXPORT_SYMBOL_GPL(system_nrt_wq);
+EXPORT_SYMBOL_GPL(system_unbound_wq);
+
+#define for_each_busy_worker(worker, i, pos, gcwq) \
+ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
+ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
+
+static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
+ unsigned int sw)
+{
+ if (cpu < nr_cpu_ids) {
+ if (sw & 1) {
+ cpu = cpumask_next(cpu, mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+ if (sw & 2)
+ return WORK_CPU_UNBOUND;
+ }
+ return WORK_CPU_NONE;
+}
+
+static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
+ struct workqueue_struct *wq)
+{
+ return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
+}
+
+/*
+ * CPU iterators
+ *
+ * An extra gcwq is defined for an invalid cpu number
+ * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
+ * specific CPU. The following iterators are similar to
+ * for_each_*_cpu() iterators but also considers the unbound gcwq.
+ *
+ * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
+ * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
+ * for_each_cwq_cpu() : possible CPUs for bound workqueues,
+ * WORK_CPU_UNBOUND for unbound workqueues
+ */
+#define for_each_gcwq_cpu(cpu) \
+ for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
+ (cpu) < WORK_CPU_NONE; \
+ (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
+
+#define for_each_online_gcwq_cpu(cpu) \
+ for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
+ (cpu) < WORK_CPU_NONE; \
+ (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
+
+#define for_each_cwq_cpu(cpu, wq) \
+ for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
+ (cpu) < WORK_CPU_NONE; \
+ (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
+
+#ifdef CONFIG_LOCKDEP
+/**
+ * in_workqueue_context() - in context of specified workqueue?
+ * @wq: the workqueue of interest
+ *
+ * Checks lockdep state to see if the current task is executing from
+ * within a workqueue item. This function exists only if lockdep is
+ * enabled.
+ */
+int in_workqueue_context(struct workqueue_struct *wq)
+{
+ return lock_is_held(&wq->lockdep_map);
+}
+#endif
+
#ifdef CONFIG_DEBUG_OBJECTS_WORK
static struct debug_obj_descr work_debug_descr;
@@ -107,7 +353,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
* statically initialized. We just make sure that it
* is tracked in the object tracker.
*/
- if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+ if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
debug_object_init(work, &work_debug_descr);
debug_object_activate(work, &work_debug_descr);
return 0;
@@ -181,94 +427,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
+static bool workqueue_freezing; /* W: have wqs started freezing? */
+
+/*
+ * The almighty global cpu workqueues. nr_running is the only field
+ * which is expected to be used frequently by other cpus via
+ * try_to_wake_up(). Put it in a separate cacheline.
+ */
+static DEFINE_PER_CPU(struct global_cwq, global_cwq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
-static int singlethread_cpu __read_mostly;
-static const struct cpumask *cpu_singlethread_map __read_mostly;
/*
- * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
- * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
- * which comes in between can't use for_each_online_cpu(). We could
- * use cpu_possible_map, the cpumask below is more a documentation
- * than optimization.
+ * Global cpu workqueue and nr_running counter for unbound gcwq. The
+ * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
+ * workers have WORKER_UNBOUND set.
*/
-static cpumask_var_t cpu_populated_map __read_mostly;
+static struct global_cwq unbound_global_cwq;
+static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
+
+static int worker_thread(void *__worker);
+
+static struct global_cwq *get_gcwq(unsigned int cpu)
+{
+ if (cpu != WORK_CPU_UNBOUND)
+ return &per_cpu(global_cwq, cpu);
+ else
+ return &unbound_global_cwq;
+}
+
+static atomic_t *get_gcwq_nr_running(unsigned int cpu)
+{
+ if (cpu != WORK_CPU_UNBOUND)
+ return &per_cpu(gcwq_nr_running, cpu);
+ else
+ return &unbound_gcwq_nr_running;
+}
-/* If it's single threaded, it isn't in the list of workqueues. */
-static inline int is_wq_single_threaded(struct workqueue_struct *wq)
+static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
+ struct workqueue_struct *wq)
{
- return wq->singlethread;
+ if (!(wq->flags & WQ_UNBOUND)) {
+ if (likely(cpu < nr_cpu_ids)) {
+#ifdef CONFIG_SMP
+ return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+#else
+ return wq->cpu_wq.single;
+#endif
+ }
+ } else if (likely(cpu == WORK_CPU_UNBOUND))
+ return wq->cpu_wq.single;
+ return NULL;
}
-static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
+static unsigned int work_color_to_flags(int color)
{
- return is_wq_single_threaded(wq)
- ? cpu_singlethread_map : cpu_populated_map;
+ return color << WORK_STRUCT_COLOR_SHIFT;
}
-static
-struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
+static int get_work_color(struct work_struct *work)
{
- if (unlikely(is_wq_single_threaded(wq)))
- cpu = singlethread_cpu;
- return per_cpu_ptr(wq->cpu_wq, cpu);
+ return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
+ ((1 << WORK_STRUCT_COLOR_BITS) - 1);
+}
+
+static int work_next_color(int color)
+{
+ return (color + 1) % WORK_NR_COLORS;
}
/*
- * Set the workqueue on which a work item is to be run
- * - Must *only* be called if the pending flag is set
+ * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
+ * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
+ * cleared and the work data contains the cpu number it was last on.
+ *
+ * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
+ * cwq, cpu or clear work->data. These functions should only be
+ * called while the work is owned - ie. while the PENDING bit is set.
+ *
+ * get_work_[g]cwq() can be used to obtain the gcwq or cwq
+ * corresponding to a work. gcwq is available once the work has been
+ * queued anywhere after initialization. cwq is available only from
+ * queueing until execution starts.
*/
-static inline void set_wq_data(struct work_struct *work,
- struct cpu_workqueue_struct *cwq)
+static inline void set_work_data(struct work_struct *work, unsigned long data,
+ unsigned long flags)
{
- unsigned long new;
-
BUG_ON(!work_pending(work));
+ atomic_long_set(&work->data, data | flags | work_static(work));
+}
- new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
- new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
- atomic_long_set(&work->data, new);
+static void set_work_cwq(struct work_struct *work,
+ struct cpu_workqueue_struct *cwq,
+ unsigned long extra_flags)
+{
+ set_work_data(work, (unsigned long)cwq,
+ WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
+}
+
+static void set_work_cpu(struct work_struct *work, unsigned int cpu)
+{
+ set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
+}
+
+static void clear_work_data(struct work_struct *work)
+{
+ set_work_data(work, WORK_STRUCT_NO_CPU, 0);
+}
+
+static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ if (data & WORK_STRUCT_CWQ)
+ return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ else
+ return NULL;
+}
+
+static struct global_cwq *get_work_gcwq(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+ unsigned int cpu;
+
+ if (data & WORK_STRUCT_CWQ)
+ return ((struct cpu_workqueue_struct *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
+
+ cpu = data >> WORK_STRUCT_FLAG_BITS;
+ if (cpu == WORK_CPU_NONE)
+ return NULL;
+
+ BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
+ return get_gcwq(cpu);
+}
+
+/*
+ * Policy functions. These define the policies on how the global
+ * worker pool is managed. Unless noted otherwise, these functions
+ * assume that they're being called with gcwq->lock held.
+ */
+
+static bool __need_more_worker(struct global_cwq *gcwq)
+{
+ return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
+ gcwq->flags & GCWQ_HIGHPRI_PENDING;
}
/*
- * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
+ * Need to wake up a worker? Called from anything but currently
+ * running workers.
*/
-static inline void clear_wq_data(struct work_struct *work)
+static bool need_more_worker(struct global_cwq *gcwq)
{
- unsigned long flags = *work_data_bits(work) &
- (1UL << WORK_STRUCT_STATIC);
- atomic_long_set(&work->data, flags);
+ return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
+}
+
+/* Can I start working? Called from busy but !running workers. */
+static bool may_start_working(struct global_cwq *gcwq)
+{
+ return gcwq->nr_idle;
+}
+
+/* Do I need to keep working? Called from currently running workers. */
+static bool keep_working(struct global_cwq *gcwq)
+{
+ atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+
+ return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
}
-static inline
-struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
+/* Do we need a new worker? Called from manager. */
+static bool need_to_create_worker(struct global_cwq *gcwq)
{
- return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
+ return need_more_worker(gcwq) && !may_start_working(gcwq);
}
+/* Do I need to be the manager? */
+static bool need_to_manage_workers(struct global_cwq *gcwq)
+{
+ return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+}
+
+/* Do we have too many workers and should some go away? */
+static bool too_many_workers(struct global_cwq *gcwq)
+{
+ bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
+ int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
+ int nr_busy = gcwq->nr_workers - nr_idle;
+
+ return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
+}
+
+/*
+ * Wake up functions.
+ */
+
+/* Return the first worker. Safe with preemption disabled */
+static struct worker *first_worker(struct global_cwq *gcwq)
+{
+ if (unlikely(list_empty(&gcwq->idle_list)))
+ return NULL;
+
+ return list_first_entry(&gcwq->idle_list, struct worker, entry);
+}
+
+/**
+ * wake_up_worker - wake up an idle worker
+ * @gcwq: gcwq to wake worker for
+ *
+ * Wake up the first idle worker of @gcwq.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void wake_up_worker(struct global_cwq *gcwq)
+{
+ struct worker *worker = first_worker(gcwq);
+
+ if (likely(worker))
+ wake_up_process(worker->task);
+}
+
+/**
+ * wq_worker_waking_up - a worker is waking up
+ * @task: task waking up
+ * @cpu: CPU @task is waking up to
+ *
+ * This function is called during try_to_wake_up() when a worker is
+ * being awoken.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
+{
+ struct worker *worker = kthread_data(task);
+
+ if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
+ atomic_inc(get_gcwq_nr_running(cpu));
+}
+
+/**
+ * wq_worker_sleeping - a worker is going to sleep
+ * @task: task going to sleep
+ * @cpu: CPU in question, must be the current CPU number
+ *
+ * This function is called during schedule() when a busy worker is
+ * going to sleep. Worker on the same cpu can be woken up by
+ * returning pointer to its task.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ *
+ * RETURNS:
+ * Worker task on @cpu to wake up, %NULL if none.
+ */
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+ unsigned int cpu)
+{
+ struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ atomic_t *nr_running = get_gcwq_nr_running(cpu);
+
+ if (unlikely(worker->flags & WORKER_NOT_RUNNING))
+ return NULL;
+
+ /* this can only happen on the local cpu */
+ BUG_ON(cpu != raw_smp_processor_id());
+
+ /*
+ * The counterpart of the following dec_and_test, implied mb,
+ * worklist not empty test sequence is in insert_work().
+ * Please read comment there.
+ *
+ * NOT_RUNNING is clear. This means that trustee is not in
+ * charge and we're running on the local cpu w/ rq lock held
+ * and preemption disabled, which in turn means that none else
+ * could be manipulating idle_list, so dereferencing idle_list
+ * without gcwq lock is safe.
+ */
+ if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
+ to_wakeup = first_worker(gcwq);
+ return to_wakeup ? to_wakeup->task : NULL;
+}
+
+/**
+ * worker_set_flags - set worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to set
+ * @wakeup: wakeup an idle worker if necessary
+ *
+ * Set @flags in @worker->flags and adjust nr_running accordingly. If
+ * nr_running becomes zero and @wakeup is %true, an idle worker is
+ * woken up.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock)
+ */
+static inline void worker_set_flags(struct worker *worker, unsigned int flags,
+ bool wakeup)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ /*
+ * If transitioning into NOT_RUNNING, adjust nr_running and
+ * wake up an idle worker as necessary if requested by
+ * @wakeup.
+ */
+ if ((flags & WORKER_NOT_RUNNING) &&
+ !(worker->flags & WORKER_NOT_RUNNING)) {
+ atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+
+ if (wakeup) {
+ if (atomic_dec_and_test(nr_running) &&
+ !list_empty(&gcwq->worklist))
+ wake_up_worker(gcwq);
+ } else
+ atomic_dec(nr_running);
+ }
+
+ worker->flags |= flags;
+}
+
+/**
+ * worker_clr_flags - clear worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to clear
+ *
+ * Clear @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock)
+ */
+static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ unsigned int oflags = worker->flags;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ worker->flags &= ~flags;
+
+ /* if transitioning out of NOT_RUNNING, increment nr_running */
+ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
+ if (!(worker->flags & WORKER_NOT_RUNNING))
+ atomic_inc(get_gcwq_nr_running(gcwq->cpu));
+}
+
+/**
+ * busy_worker_head - return the busy hash head for a work
+ * @gcwq: gcwq of interest
+ * @work: work to be hashed
+ *
+ * Return hash head of @gcwq for @work.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to the hash head.
+ */
+static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
+ struct work_struct *work)
+{
+ const int base_shift = ilog2(sizeof(struct work_struct));
+ unsigned long v = (unsigned long)work;
+
+ /* simple shift and fold hash, do we need something better? */
+ v >>= base_shift;
+ v += v >> BUSY_WORKER_HASH_ORDER;
+ v &= BUSY_WORKER_HASH_MASK;
+
+ return &gcwq->busy_hash[v];
+}
+
+/**
+ * __find_worker_executing_work - find worker which is executing a work
+ * @gcwq: gcwq of interest
+ * @bwh: hash head as returned by busy_worker_head()
+ * @work: work to find worker for
+ *
+ * Find a worker which is executing @work on @gcwq. @bwh should be
+ * the hash head obtained by calling busy_worker_head() with the same
+ * work.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to worker which is executing @work if found, NULL
+ * otherwise.
+ */
+static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
+ struct hlist_head *bwh,
+ struct work_struct *work)
+{
+ struct worker *worker;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry(worker, tmp, bwh, hentry)
+ if (worker->current_work == work)
+ return worker;
+ return NULL;
+}
+
+/**
+ * find_worker_executing_work - find worker which is executing a work
+ * @gcwq: gcwq of interest
+ * @work: work to find worker for
+ *
+ * Find a worker which is executing @work on @gcwq. This function is
+ * identical to __find_worker_executing_work() except that this
+ * function calculates @bwh itself.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to worker which is executing @work if found, NULL
+ * otherwise.
+ */
+static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
+ struct work_struct *work)
+{
+ return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
+ work);
+}
+
+/**
+ * gcwq_determine_ins_pos - find insertion position
+ * @gcwq: gcwq of interest
+ * @cwq: cwq a work is being queued for
+ *
+ * A work for @cwq is about to be queued on @gcwq, determine insertion
+ * position for the work. If @cwq is for HIGHPRI wq, the work is
+ * queued at the head of the queue but in FIFO order with respect to
+ * other HIGHPRI works; otherwise, at the end of the queue. This
+ * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
+ * there are HIGHPRI works pending.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to inserstion position.
+ */
+static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
+ struct cpu_workqueue_struct *cwq)
+{
+ struct work_struct *twork;
+
+ if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
+ return &gcwq->worklist;
+
+ list_for_each_entry(twork, &gcwq->worklist, entry) {
+ struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
+
+ if (!(tcwq->wq->flags & WQ_HIGHPRI))
+ break;
+ }
+
+ gcwq->flags |= GCWQ_HIGHPRI_PENDING;
+ return &twork->entry;
+}
+
+/**
+ * insert_work - insert a work into gcwq
+ * @cwq: cwq @work belongs to
+ * @work: work to insert
+ * @head: insertion point
+ * @extra_flags: extra WORK_STRUCT_* flags to set
+ *
+ * Insert @work which belongs to @cwq into @gcwq after @head.
+ * @extra_flags is or'd to work_struct flags.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
static void insert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work, struct list_head *head)
+ struct work_struct *work, struct list_head *head,
+ unsigned int extra_flags)
{
- trace_workqueue_insertion(cwq->thread, work);
+ struct global_cwq *gcwq = cwq->gcwq;
+
+ /* we own @work, set data and link */
+ set_work_cwq(work, cwq, extra_flags);
- set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
* result of list_add() below, see try_to_grab_pending().
*/
smp_wmb();
+
list_add_tail(&work->entry, head);
- wake_up(&cwq->more_work);
+
+ /*
+ * Ensure either worker_sched_deactivated() sees the above
+ * list_add_tail() or we see zero nr_running to avoid workers
+ * lying around lazily while there are works to be processed.
+ */
+ smp_mb();
+
+ if (__need_more_worker(gcwq))
+ wake_up_worker(gcwq);
}
-static void __queue_work(struct cpu_workqueue_struct *cwq,
+static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
+ struct global_cwq *gcwq;
+ struct cpu_workqueue_struct *cwq;
+ struct list_head *worklist;
unsigned long flags;
debug_work_activate(work);
- spin_lock_irqsave(&cwq->lock, flags);
- insert_work(cwq, work, &cwq->worklist);
- spin_unlock_irqrestore(&cwq->lock, flags);
+
+ /* determine gcwq to use */
+ if (!(wq->flags & WQ_UNBOUND)) {
+ struct global_cwq *last_gcwq;
+
+ if (unlikely(cpu == WORK_CPU_UNBOUND))
+ cpu = raw_smp_processor_id();
+
+ /*
+ * It's multi cpu. If @wq is non-reentrant and @work
+ * was previously on a different cpu, it might still
+ * be running there, in which case the work needs to
+ * be queued on that cpu to guarantee non-reentrance.
+ */
+ gcwq = get_gcwq(cpu);
+ if (wq->flags & WQ_NON_REENTRANT &&
+ (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
+ struct worker *worker;
+
+ spin_lock_irqsave(&last_gcwq->lock, flags);
+
+ worker = find_worker_executing_work(last_gcwq, work);
+
+ if (worker && worker->current_cwq->wq == wq)
+ gcwq = last_gcwq;
+ else {
+ /* meh... not running there, queue here */
+ spin_unlock_irqrestore(&last_gcwq->lock, flags);
+ spin_lock_irqsave(&gcwq->lock, flags);
+ }
+ } else
+ spin_lock_irqsave(&gcwq->lock, flags);
+ } else {
+ gcwq = get_gcwq(WORK_CPU_UNBOUND);
+ spin_lock_irqsave(&gcwq->lock, flags);
+ }
+
+ /* gcwq determined, get cwq and queue */
+ cwq = get_cwq(gcwq->cpu, wq);
+
+ BUG_ON(!list_empty(&work->entry));
+
+ cwq->nr_in_flight[cwq->work_color]++;
+
+ if (likely(cwq->nr_active < cwq->max_active)) {
+ cwq->nr_active++;
+ worklist = gcwq_determine_ins_pos(gcwq, cwq);
+ } else
+ worklist = &cwq->delayed_works;
+
+ insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));
+
+ spin_unlock_irqrestore(&gcwq->lock, flags);
}
/**
@@ -308,9 +1035,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
{
int ret = 0;
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(!list_empty(&work->entry));
- __queue_work(wq_per_cpu(wq, cpu), work);
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ __queue_work(cpu, wq, work);
ret = 1;
}
return ret;
@@ -320,10 +1046,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
static void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
- struct workqueue_struct *wq = cwq->wq;
+ struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
- __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
+ __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
}
/**
@@ -360,14 +1085,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ unsigned int lcpu;
+
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
timer_stats_timer_set_start_info(&dwork->timer);
- /* This stores cwq for the moment, for the timer_fn */
- set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
+ /*
+ * This stores cwq for the moment, for the timer_fn.
+ * Note that the work's gcwq is preserved to allow
+ * reentrance detection for delayed works.
+ */
+ if (!(wq->flags & WQ_UNBOUND)) {
+ struct global_cwq *gcwq = get_work_gcwq(work);
+
+ if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
+ lcpu = gcwq->cpu;
+ else
+ lcpu = raw_smp_processor_id();
+ } else
+ lcpu = WORK_CPU_UNBOUND;
+
+ set_work_cwq(work, get_cwq(lcpu, wq), 0);
+
timer->expires = jiffies + delay;
timer->data = (unsigned long)dwork;
timer->function = delayed_work_timer_fn;
@@ -382,80 +1124,872 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
-static void run_workqueue(struct cpu_workqueue_struct *cwq)
+/**
+ * worker_enter_idle - enter idle state
+ * @worker: worker which is entering idle state
+ *
+ * @worker is entering idle state. Update stats and idle timer if
+ * necessary.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void worker_enter_idle(struct worker *worker)
{
- spin_lock_irq(&cwq->lock);
- while (!list_empty(&cwq->worklist)) {
- struct work_struct *work = list_entry(cwq->worklist.next,
- struct work_struct, entry);
- work_func_t f = work->func;
-#ifdef CONFIG_LOCKDEP
+ struct global_cwq *gcwq = worker->gcwq;
+
+ BUG_ON(worker->flags & WORKER_IDLE);
+ BUG_ON(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev));
+
+ /* can't use worker_set_flags(), also called from start_worker() */
+ worker->flags |= WORKER_IDLE;
+ gcwq->nr_idle++;
+ worker->last_active = jiffies;
+
+ /* idle_list is LIFO */
+ list_add(&worker->entry, &gcwq->idle_list);
+
+ if (likely(!(worker->flags & WORKER_ROGUE))) {
+ if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
+ mod_timer(&gcwq->idle_timer,
+ jiffies + IDLE_WORKER_TIMEOUT);
+ } else
+ wake_up_all(&gcwq->trustee_wait);
+
+ /* sanity check nr_running */
+ WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+ atomic_read(get_gcwq_nr_running(gcwq->cpu)));
+}
+
+/**
+ * worker_leave_idle - leave idle state
+ * @worker: worker which is leaving idle state
+ *
+ * @worker is leaving idle state. Update stats.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void worker_leave_idle(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+
+ BUG_ON(!(worker->flags & WORKER_IDLE));
+ worker_clr_flags(worker, WORKER_IDLE);
+ gcwq->nr_idle--;
+ list_del_init(&worker->entry);
+}
+
+/**
+ * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
+ * @worker: self
+ *
+ * Works which are scheduled while the cpu is online must at least be
+ * scheduled to a worker which is bound to the cpu so that if they are
+ * flushed from cpu callbacks while cpu is going down, they are
+ * guaranteed to execute on the cpu.
+ *
+ * This function is to be used by rogue workers and rescuers to bind
+ * themselves to the target cpu and may race with cpu going down or
+ * coming online. kthread_bind() can't be used because it may put the
+ * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
+ * verbatim as it's best effort and blocking and gcwq may be
+ * [dis]associated in the meantime.
+ *
+ * This function tries set_cpus_allowed() and locks gcwq and verifies
+ * the binding against GCWQ_DISASSOCIATED which is set during
+ * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
+ * idle state or fetches works without dropping lock, it can guarantee
+ * the scheduling requirement described in the first paragraph.
+ *
+ * CONTEXT:
+ * Might sleep. Called without any lock but returns with gcwq->lock
+ * held.
+ *
+ * RETURNS:
+ * %true if the associated gcwq is online (@worker is successfully
+ * bound), %false if offline.
+ */
+static bool worker_maybe_bind_and_lock(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ struct task_struct *task = worker->task;
+
+ while (true) {
/*
- * It is permissible to free the struct work_struct
- * from inside the function that is called from it,
- * this we need to take into account for lockdep too.
- * To avoid bogus "held lock freed" warnings as well
- * as problems when looking into work->lockdep_map,
- * make a copy and use that here.
+ * The following call may fail, succeed or succeed
+ * without actually migrating the task to the cpu if
+ * it races with cpu hotunplug operation. Verify
+ * against GCWQ_DISASSOCIATED.
*/
- struct lockdep_map lockdep_map = work->lockdep_map;
-#endif
- trace_workqueue_execution(cwq->thread, work);
- debug_work_deactivate(work);
- cwq->current_work = work;
- list_del_init(cwq->worklist.next);
- spin_unlock_irq(&cwq->lock);
-
- BUG_ON(get_wq_data(work) != cwq);
- work_clear_pending(work);
- lock_map_acquire(&cwq->wq->lockdep_map);
- lock_map_acquire(&lockdep_map);
- f(work);
- lock_map_release(&lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
-
- if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
- printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(),
- task_pid_nr(current));
- printk(KERN_ERR " last function: ");
- print_symbol("%s\n", (unsigned long)f);
- debug_show_held_locks(current);
- dump_stack();
+ if (!(gcwq->flags & GCWQ_DISASSOCIATED))
+ set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
+
+ spin_lock_irq(&gcwq->lock);
+ if (gcwq->flags & GCWQ_DISASSOCIATED)
+ return false;
+ if (task_cpu(task) == gcwq->cpu &&
+ cpumask_equal(&current->cpus_allowed,
+ get_cpu_mask(gcwq->cpu)))
+ return true;
+ spin_unlock_irq(&gcwq->lock);
+
+ /* CPU has come up inbetween, retry migration */
+ cpu_relax();
+ }
+}
+
+/*
+ * Function for worker->rebind_work used to rebind rogue busy workers
+ * to the associated cpu which is coming back online. This is
+ * scheduled by cpu up but can race with other cpu hotplug operations
+ * and may be executed twice without intervening cpu down.
+ */
+static void worker_rebind_fn(struct work_struct *work)
+{
+ struct worker *worker = container_of(work, struct worker, rebind_work);
+ struct global_cwq *gcwq = worker->gcwq;
+
+ if (worker_maybe_bind_and_lock(worker))
+ worker_clr_flags(worker, WORKER_REBIND);
+
+ spin_unlock_irq(&gcwq->lock);
+}
+
+static struct worker *alloc_worker(void)
+{
+ struct worker *worker;
+
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ if (worker) {
+ INIT_LIST_HEAD(&worker->entry);
+ INIT_LIST_HEAD(&worker->scheduled);
+ INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+ /* on creation a worker is in !idle && prep state */
+ worker->flags = WORKER_PREP;
+ }
+ return worker;
+}
+
+/**
+ * create_worker - create a new workqueue worker
+ * @gcwq: gcwq the new worker will belong to
+ * @bind: whether to set affinity to @cpu or not
+ *
+ * Create a new worker which is bound to @gcwq. The returned worker
+ * can be started by calling start_worker() or destroyed using
+ * destroy_worker().
+ *
+ * CONTEXT:
+ * Might sleep. Does GFP_KERNEL allocations.
+ *
+ * RETURNS:
+ * Pointer to the newly created worker.
+ */
+static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
+{
+ bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+ struct worker *worker = NULL;
+ int id = -1;
+
+ spin_lock_irq(&gcwq->lock);
+ while (ida_get_new(&gcwq->worker_ida, &id)) {
+ spin_unlock_irq(&gcwq->lock);
+ if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
+ goto fail;
+ spin_lock_irq(&gcwq->lock);
+ }
+ spin_unlock_irq(&gcwq->lock);
+
+ worker = alloc_worker();
+ if (!worker)
+ goto fail;
+
+ worker->gcwq = gcwq;
+ worker->id = id;
+
+ if (!on_unbound_cpu)
+ worker->task = kthread_create(worker_thread, worker,
+ "kworker/%u:%d", gcwq->cpu, id);
+ else
+ worker->task = kthread_create(worker_thread, worker,
+ "kworker/u:%d", id);
+ if (IS_ERR(worker->task))
+ goto fail;
+
+ /*
+ * A rogue worker will become a regular one if CPU comes
+ * online later on. Make sure every worker has
+ * PF_THREAD_BOUND set.
+ */
+ if (bind && !on_unbound_cpu)
+ kthread_bind(worker->task, gcwq->cpu);
+ else {
+ worker->task->flags |= PF_THREAD_BOUND;
+ if (on_unbound_cpu)
+ worker->flags |= WORKER_UNBOUND;
+ }
+
+ return worker;
+fail:
+ if (id >= 0) {
+ spin_lock_irq(&gcwq->lock);
+ ida_remove(&gcwq->worker_ida, id);
+ spin_unlock_irq(&gcwq->lock);
+ }
+ kfree(worker);
+ return NULL;
+}
+
+/**
+ * start_worker - start a newly created worker
+ * @worker: worker to start
+ *
+ * Make the gcwq aware of @worker and start it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void start_worker(struct worker *worker)
+{
+ worker->flags |= WORKER_STARTED;
+ worker->gcwq->nr_workers++;
+ worker_enter_idle(worker);
+ wake_up_process(worker->task);
+}
+
+/**
+ * destroy_worker - destroy a workqueue worker
+ * @worker: worker to be destroyed
+ *
+ * Destroy @worker and adjust @gcwq stats accordingly.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ */
+static void destroy_worker(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ int id = worker->id;
+
+ /* sanity check frenzy */
+ BUG_ON(worker->current_work);
+ BUG_ON(!list_empty(&worker->scheduled));
+
+ if (worker->flags & WORKER_STARTED)
+ gcwq->nr_workers--;
+ if (worker->flags & WORKER_IDLE)
+ gcwq->nr_idle--;
+
+ list_del_init(&worker->entry);
+ worker->flags |= WORKER_DIE;
+
+ spin_unlock_irq(&gcwq->lock);
+
+ kthread_stop(worker->task);
+ kfree(worker);
+
+ spin_lock_irq(&gcwq->lock);
+ ida_remove(&gcwq->worker_ida, id);
+}
+
+static void idle_worker_timeout(unsigned long __gcwq)
+{
+ struct global_cwq *gcwq = (void *)__gcwq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (too_many_workers(gcwq)) {
+ struct worker *worker;
+ unsigned long expires;
+
+ /* idle_list is kept in LIFO order, check the last one */
+ worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+
+ if (time_before(jiffies, expires))
+ mod_timer(&gcwq->idle_timer, expires);
+ else {
+ /* it's been idle for too long, wake up manager */
+ gcwq->flags |= GCWQ_MANAGE_WORKERS;
+ wake_up_worker(gcwq);
+ }
+ }
+
+ spin_unlock_irq(&gcwq->lock);
+}
+
+static bool send_mayday(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct workqueue_struct *wq = cwq->wq;
+ unsigned int cpu;
+
+ if (!(wq->flags & WQ_RESCUER))
+ return false;
+
+ /* mayday mayday mayday */
+ cpu = cwq->gcwq->cpu;
+ /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
+ if (cpu == WORK_CPU_UNBOUND)
+ cpu = 0;
+ if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
+ wake_up_process(wq->rescuer->task);
+ return true;
+}
+
+static void gcwq_mayday_timeout(unsigned long __gcwq)
+{
+ struct global_cwq *gcwq = (void *)__gcwq;
+ struct work_struct *work;
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (need_to_create_worker(gcwq)) {
+ /*
+ * We've been trying to create a new worker but
+ * haven't been successful. We might be hitting an
+ * allocation deadlock. Send distress signals to
+ * rescuers.
+ */
+ list_for_each_entry(work, &gcwq->worklist, entry)
+ send_mayday(work);
+ }
+
+ spin_unlock_irq(&gcwq->lock);
+
+ mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+}
+
+/**
+ * maybe_create_worker - create a new worker if necessary
+ * @gcwq: gcwq to create a new worker for
+ *
+ * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
+ * have at least one idle worker on return from this function. If
+ * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
+ * sent to all rescuers with works scheduled on @gcwq to resolve
+ * possible allocation deadlock.
+ *
+ * On return, need_to_create_worker() is guaranteed to be false and
+ * may_start_working() true.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. Does GFP_KERNEL allocations. Called only from
+ * manager.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true
+ * otherwise.
+ */
+static bool maybe_create_worker(struct global_cwq *gcwq)
+{
+ if (!need_to_create_worker(gcwq))
+ return false;
+restart:
+ spin_unlock_irq(&gcwq->lock);
+
+ /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
+ mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+
+ while (true) {
+ struct worker *worker;
+
+ worker = create_worker(gcwq, true);
+ if (worker) {
+ del_timer_sync(&gcwq->mayday_timer);
+ spin_lock_irq(&gcwq->lock);
+ start_worker(worker);
+ BUG_ON(need_to_create_worker(gcwq));
+ return true;
}
- spin_lock_irq(&cwq->lock);
- cwq->current_work = NULL;
+ if (!need_to_create_worker(gcwq))
+ break;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(CREATE_COOLDOWN);
+
+ if (!need_to_create_worker(gcwq))
+ break;
}
- spin_unlock_irq(&cwq->lock);
+
+ del_timer_sync(&gcwq->mayday_timer);
+ spin_lock_irq(&gcwq->lock);
+ if (need_to_create_worker(gcwq))
+ goto restart;
+ return true;
}
-static int worker_thread(void *__cwq)
+/**
+ * maybe_destroy_worker - destroy workers which have been idle for a while
+ * @gcwq: gcwq to destroy workers for
+ *
+ * Destroy @gcwq workers which have been idle for longer than
+ * IDLE_WORKER_TIMEOUT.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. Called only from manager.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true
+ * otherwise.
+ */
+static bool maybe_destroy_workers(struct global_cwq *gcwq)
{
- struct cpu_workqueue_struct *cwq = __cwq;
- DEFINE_WAIT(wait);
+ bool ret = false;
- if (cwq->wq->freezeable)
- set_freezable();
+ while (too_many_workers(gcwq)) {
+ struct worker *worker;
+ unsigned long expires;
- for (;;) {
- prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
- if (!freezing(current) &&
- !kthread_should_stop() &&
- list_empty(&cwq->worklist))
- schedule();
- finish_wait(&cwq->more_work, &wait);
+ worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
- try_to_freeze();
+ if (time_before(jiffies, expires)) {
+ mod_timer(&gcwq->idle_timer, expires);
+ break;
+ }
- if (kthread_should_stop())
+ destroy_worker(worker);
+ ret = true;
+ }
+
+ return ret;
+}
+
+/**
+ * manage_workers - manage worker pool
+ * @worker: self
+ *
+ * Assume the manager role and manage gcwq worker pool @worker belongs
+ * to. At any given time, there can be only zero or one manager per
+ * gcwq. The exclusion is handled automatically by this function.
+ *
+ * The caller can safely start processing works on false return. On
+ * true return, it's guaranteed that need_to_create_worker() is false
+ * and may_start_working() is true.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. Does GFP_KERNEL allocations.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true if
+ * some action was taken.
+ */
+static bool manage_workers(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ bool ret = false;
+
+ if (gcwq->flags & GCWQ_MANAGING_WORKERS)
+ return ret;
+
+ gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
+ gcwq->flags |= GCWQ_MANAGING_WORKERS;
+
+ /*
+ * Destroy and then create so that may_start_working() is true
+ * on return.
+ */
+ ret |= maybe_destroy_workers(gcwq);
+ ret |= maybe_create_worker(gcwq);
+
+ gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+ /*
+ * The trustee might be waiting to take over the manager
+ * position, tell it we're done.
+ */
+ if (unlikely(gcwq->trustee))
+ wake_up_all(&gcwq->trustee_wait);
+
+ return ret;
+}
+
+/**
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out paramter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head. Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work. This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+ struct work_struct **nextp)
+{
+ struct work_struct *n;
+
+ /*
+ * Linked worklist will always end before the end of the list,
+ * use NULL for list head.
+ */
+ list_for_each_entry_safe_from(work, n, NULL, entry) {
+ list_move_tail(&work->entry, head);
+ if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
break;
+ }
- run_workqueue(cwq);
+ /*
+ * If we're already inside safe list traversal and have moved
+ * multiple works to the scheduled queue, the next position
+ * needs to be updated.
+ */
+ if (nextp)
+ *nextp = n;
+}
+
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+ struct work_struct *work = list_first_entry(&cwq->delayed_works,
+ struct work_struct, entry);
+ struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
+
+ move_linked_works(work, pos, NULL);
+ cwq->nr_active++;
+}
+
+/**
+ * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * @cwq: cwq of interest
+ * @color: color of work which left the queue
+ *
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
+{
+ /* ignore uncolored works */
+ if (color == WORK_NO_COLOR)
+ return;
+
+ cwq->nr_in_flight[color]--;
+ cwq->nr_active--;
+
+ if (!list_empty(&cwq->delayed_works)) {
+ /* one down, submit a delayed one */
+ if (cwq->nr_active < cwq->max_active)
+ cwq_activate_first_delayed(cwq);
}
- return 0;
+ /* is flush in progress and are we at the flushing tip? */
+ if (likely(cwq->flush_color != color))
+ return;
+
+ /* are there still in-flight works? */
+ if (cwq->nr_in_flight[color])
+ return;
+
+ /* this cwq is done, clear flush_color */
+ cwq->flush_color = -1;
+
+ /*
+ * If this was the last cwq, wake up the first flusher. It
+ * will handle the rest.
+ */
+ if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+ complete(&cwq->wq->first_flusher->done);
+}
+
+/**
+ * process_one_work - process single work
+ * @worker: self
+ * @work: work to process
+ *
+ * Process @work. This function contains all the logics necessary to
+ * process a single work including synchronization against and
+ * interaction with other workers on the same cpu, queueing and
+ * flushing. As long as context requirement is met, any worker can
+ * call this function to process a work.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ */
+static void process_one_work(struct worker *worker, struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct global_cwq *gcwq = cwq->gcwq;
+ struct hlist_head *bwh = busy_worker_head(gcwq, work);
+ bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
+ work_func_t f = work->func;
+ int work_color;
+ struct worker *collision;
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It is permissible to free the struct work_struct from
+ * inside the function that is called from it, this we need to
+ * take into account for lockdep too. To avoid bogus "held
+ * lock freed" warnings as well as problems when looking into
+ * work->lockdep_map, make a copy and use that here.
+ */
+ struct lockdep_map lockdep_map = work->lockdep_map;
+#endif
+ /*
+ * A single work shouldn't be executed concurrently by
+ * multiple workers on a single cpu. Check whether anyone is
+ * already processing the work. If so, defer the work to the
+ * currently executing one.
+ */
+ collision = __find_worker_executing_work(gcwq, bwh, work);
+ if (unlikely(collision)) {
+ move_linked_works(work, &collision->scheduled, NULL);
+ return;
+ }
+
+ /* claim and process */
+ debug_work_deactivate(work);
+ hlist_add_head(&worker->hentry, bwh);
+ worker->current_work = work;
+ worker->current_cwq = cwq;
+ work_color = get_work_color(work);
+
+ /* record the current cpu number in the work data and dequeue */
+ set_work_cpu(work, gcwq->cpu);
+ list_del_init(&work->entry);
+
+ /*
+ * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
+ * wake up another worker; otherwise, clear HIGHPRI_PENDING.
+ */
+ if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
+ struct work_struct *nwork = list_first_entry(&gcwq->worklist,
+ struct work_struct, entry);
+
+ if (!list_empty(&gcwq->worklist) &&
+ get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
+ wake_up_worker(gcwq);
+ else
+ gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
+ }
+
+ /*
+ * CPU intensive works don't participate in concurrency
+ * management. They're the scheduler's responsibility.
+ */
+ if (unlikely(cpu_intensive))
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+
+ spin_unlock_irq(&gcwq->lock);
+
+ work_clear_pending(work);
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_acquire(&lockdep_map);
+ f(work);
+ lock_map_release(&lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
+
+ if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+ printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(), task_pid_nr(current));
+ printk(KERN_ERR " last function: ");
+ print_symbol("%s\n", (unsigned long)f);
+ debug_show_held_locks(current);
+ dump_stack();
+ }
+
+ spin_lock_irq(&gcwq->lock);
+
+ /* clear cpu intensive status */
+ if (unlikely(cpu_intensive))
+ worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+
+ /* we're done with it, release */
+ hlist_del_init(&worker->hentry);
+ worker->current_work = NULL;
+ worker->current_cwq = NULL;
+ cwq_dec_nr_in_flight(cwq, work_color);
+}
+
+/**
+ * process_scheduled_works - process scheduled works
+ * @worker: self
+ *
+ * Process all scheduled works. Please note that the scheduled list
+ * may change while processing a work, so this function repeatedly
+ * fetches a work from the top and executes it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.
+ */
+static void process_scheduled_works(struct worker *worker)
+{
+ while (!list_empty(&worker->scheduled)) {
+ struct work_struct *work = list_first_entry(&worker->scheduled,
+ struct work_struct, entry);
+ process_one_work(worker, work);
+ }
+}
+
+/**
+ * worker_thread - the worker thread function
+ * @__worker: self
+ *
+ * The gcwq worker thread function. There's a single dynamic pool of
+ * these per each cpu. These workers process all works regardless of
+ * their specific target workqueue. The only exception is works which
+ * belong to workqueues with a rescuer which will be explained in
+ * rescuer_thread().
+ */
+static int worker_thread(void *__worker)
+{
+ struct worker *worker = __worker;
+ struct global_cwq *gcwq = worker->gcwq;
+
+ /* tell the scheduler that this is a workqueue worker */
+ worker->task->flags |= PF_WQ_WORKER;
+woke_up:
+ spin_lock_irq(&gcwq->lock);
+
+ /* DIE can be set only while we're idle, checking here is enough */
+ if (worker->flags & WORKER_DIE) {
+ spin_unlock_irq(&gcwq->lock);
+ worker->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
+ worker_leave_idle(worker);
+recheck:
+ /* no more worker necessary? */
+ if (!need_more_worker(gcwq))
+ goto sleep;
+
+ /* do we need to manage? */
+ if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+ goto recheck;
+
+ /*
+ * ->scheduled list can only be filled while a worker is
+ * preparing to process a work or actually processing it.
+ * Make sure nobody diddled with it while I was sleeping.
+ */
+ BUG_ON(!list_empty(&worker->scheduled));
+
+ /*
+ * When control reaches this point, we're guaranteed to have
+ * at least one idle worker or that someone else has already
+ * assumed the manager role.
+ */
+ worker_clr_flags(worker, WORKER_PREP);
+
+ do {
+ struct work_struct *work =
+ list_first_entry(&gcwq->worklist,
+ struct work_struct, entry);
+
+ if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
+ /* optimization path, not strictly necessary */
+ process_one_work(worker, work);
+ if (unlikely(!list_empty(&worker->scheduled)))
+ process_scheduled_works(worker);
+ } else {
+ move_linked_works(work, &worker->scheduled, NULL);
+ process_scheduled_works(worker);
+ }
+ } while (keep_working(gcwq));
+
+ worker_set_flags(worker, WORKER_PREP, false);
+sleep:
+ if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+ goto recheck;
+
+ /*
+ * gcwq->lock is held and there's no work to process and no
+ * need to manage, sleep. Workers are woken up only while
+ * holding gcwq->lock or from local cpu, so setting the
+ * current state before releasing gcwq->lock is enough to
+ * prevent losing any event.
+ */
+ worker_enter_idle(worker);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&gcwq->lock);
+ schedule();
+ goto woke_up;
+}
+
+/**
+ * rescuer_thread - the rescuer thread function
+ * @__wq: the associated workqueue
+ *
+ * Workqueue rescuer thread function. There's one rescuer for each
+ * workqueue which has WQ_RESCUER set.
+ *
+ * Regular work processing on a gcwq may block trying to create a new
+ * worker which uses GFP_KERNEL allocation which has slight chance of
+ * developing into deadlock if some works currently on the same queue
+ * need to be processed to satisfy the GFP_KERNEL allocation. This is
+ * the problem rescuer solves.
+ *
+ * When such condition is possible, the gcwq summons rescuers of all
+ * workqueues which have works queued on the gcwq and let them process
+ * those works so that forward progress can be guaranteed.
+ *
+ * This should happen rarely.
+ */
+static int rescuer_thread(void *__wq)
+{
+ struct workqueue_struct *wq = __wq;
+ struct worker *rescuer = wq->rescuer;
+ struct list_head *scheduled = &rescuer->scheduled;
+ bool is_unbound = wq->flags & WQ_UNBOUND;
+ unsigned int cpu;
+
+ set_user_nice(current, RESCUER_NICE_LEVEL);
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop())
+ return 0;
+
+ /*
+ * See whether any cpu is asking for help. Unbounded
+ * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
+ */
+ for_each_mayday_cpu(cpu, wq->mayday_mask) {
+ unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
+ struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
+ struct global_cwq *gcwq = cwq->gcwq;
+ struct work_struct *work, *n;
+
+ __set_current_state(TASK_RUNNING);
+ mayday_clear_cpu(cpu, wq->mayday_mask);
+
+ /* migrate to the target cpu if possible */
+ rescuer->gcwq = gcwq;
+ worker_maybe_bind_and_lock(rescuer);
+
+ /*
+ * Slurp in all works issued via this workqueue and
+ * process'em.
+ */
+ BUG_ON(!list_empty(&rescuer->scheduled));
+ list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+ if (get_work_cwq(work) == cwq)
+ move_linked_works(work, scheduled, &n);
+
+ process_scheduled_works(rescuer);
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ schedule();
+ goto repeat;
}
struct wq_barrier {
@@ -469,44 +2003,137 @@ static void wq_barrier_func(struct work_struct *work)
complete(&barr->done);
}
+/**
+ * insert_wq_barrier - insert a barrier work
+ * @cwq: cwq to insert barrier into
+ * @barr: wq_barrier to insert
+ * @target: target work to attach @barr to
+ * @worker: worker currently executing @target, NULL if @target is not executing
+ *
+ * @barr is linked to @target such that @barr is completed only after
+ * @target finishes execution. Please note that the ordering
+ * guarantee is observed only with respect to @target and on the local
+ * cpu.
+ *
+ * Currently, a queued barrier can't be canceled. This is because
+ * try_to_grab_pending() can't determine whether the work to be
+ * grabbed is at the head of the queue and thus can't clear LINKED
+ * flag of the previous work while there must be a valid next work
+ * after a work with LINKED flag set.
+ *
+ * Note that when @worker is non-NULL, @target may be modified
+ * underneath us, so we can't reliably determine cwq from @target.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
- struct wq_barrier *barr, struct list_head *head)
+ struct wq_barrier *barr,
+ struct work_struct *target, struct worker *worker)
{
+ struct list_head *head;
+ unsigned int linked = 0;
+
/*
- * debugobject calls are safe here even with cwq->lock locked
+ * debugobject calls are safe here even with gcwq->lock locked
* as we know for sure that this will not trigger any of the
* checks and call back into the fixup functions where we
* might deadlock.
*/
INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
- __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
-
+ __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
+ /*
+ * If @target is currently being executed, schedule the
+ * barrier to the worker; otherwise, put it after @target.
+ */
+ if (worker)
+ head = worker->scheduled.next;
+ else {
+ unsigned long *bits = work_data_bits(target);
+
+ head = target->entry.next;
+ /* there can already be other linked works, inherit and set */
+ linked = *bits & WORK_STRUCT_LINKED;
+ __set_bit(WORK_STRUCT_LINKED_BIT, bits);
+ }
+
debug_work_activate(&barr->work);
- insert_work(cwq, &barr->work, head);
+ insert_work(cwq, &barr->work, head,
+ work_color_to_flags(WORK_NO_COLOR) | linked);
}
-static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
+/**
+ * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * @wq: workqueue being flushed
+ * @flush_color: new flush color, < 0 for no-op
+ * @work_color: new work color, < 0 for no-op
+ *
+ * Prepare cwqs for workqueue flushing.
+ *
+ * If @flush_color is non-negative, flush_color on all cwqs should be
+ * -1. If no cwq has in-flight commands at the specified color, all
+ * cwq->flush_color's stay at -1 and %false is returned. If any cwq
+ * has in flight commands, its cwq->flush_color is set to
+ * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * wakeup logic is armed and %true is returned.
+ *
+ * The caller should have initialized @wq->first_flusher prior to
+ * calling this function with non-negative @flush_color. If
+ * @flush_color is negative, no flush color update is done and %false
+ * is returned.
+ *
+ * If @work_color is non-negative, all cwqs should have the same
+ * work_color which is previous to @work_color and all will be
+ * advanced to @work_color.
+ *
+ * CONTEXT:
+ * mutex_lock(wq->flush_mutex).
+ *
+ * RETURNS:
+ * %true if @flush_color >= 0 and there's something to flush. %false
+ * otherwise.
+ */
+static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+ int flush_color, int work_color)
{
- int active = 0;
- struct wq_barrier barr;
+ bool wait = false;
+ unsigned int cpu;
- WARN_ON(cwq->thread == current);
-
- spin_lock_irq(&cwq->lock);
- if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
- insert_wq_barrier(cwq, &barr, &cwq->worklist);
- active = 1;
+ if (flush_color >= 0) {
+ BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
+ atomic_set(&wq->nr_cwqs_to_flush, 1);
}
- spin_unlock_irq(&cwq->lock);
- if (active) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct global_cwq *gcwq = cwq->gcwq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (flush_color >= 0) {
+ BUG_ON(cwq->flush_color != -1);
+
+ if (cwq->nr_in_flight[flush_color]) {
+ cwq->flush_color = flush_color;
+ atomic_inc(&wq->nr_cwqs_to_flush);
+ wait = true;
+ }
+ }
+
+ if (work_color >= 0) {
+ BUG_ON(work_color != work_next_color(cwq->work_color));
+ cwq->work_color = work_color;
+ }
+
+ spin_unlock_irq(&gcwq->lock);
}
- return active;
+ if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+ complete(&wq->first_flusher->done);
+
+ return wait;
}
/**
@@ -518,20 +2145,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
*
* We sleep until all works which were queued on entry have been handled,
* but we are not livelocked by new incoming ones.
- *
- * This function used to run the workqueues itself. Now we just wait for the
- * helper threads to do it.
*/
void flush_workqueue(struct workqueue_struct *wq)
{
- const struct cpumask *cpu_map = wq_cpu_map(wq);
- int cpu;
+ struct wq_flusher this_flusher = {
+ .list = LIST_HEAD_INIT(this_flusher.list),
+ .flush_color = -1,
+ .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+ };
+ int next_color;
- might_sleep();
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
- for_each_cpu(cpu, cpu_map)
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+
+ mutex_lock(&wq->flush_mutex);
+
+ /*
+ * Start-to-wait phase
+ */
+ next_color = work_next_color(wq->work_color);
+
+ if (next_color != wq->flush_color) {
+ /*
+ * Color space is not full. The current work_color
+ * becomes our flush_color and work_color is advanced
+ * by one.
+ */
+ BUG_ON(!list_empty(&wq->flusher_overflow));
+ this_flusher.flush_color = wq->work_color;
+ wq->work_color = next_color;
+
+ if (!wq->first_flusher) {
+ /* no flush in progress, become the first flusher */
+ BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+ wq->first_flusher = &this_flusher;
+
+ if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+ wq->work_color)) {
+ /* nothing to flush, done */
+ wq->flush_color = next_color;
+ wq->first_flusher = NULL;
+ goto out_unlock;
+ }
+ } else {
+ /* wait in queue */
+ BUG_ON(wq->flush_color == this_flusher.flush_color);
+ list_add_tail(&this_flusher.list, &wq->flusher_queue);
+ flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ }
+ } else {
+ /*
+ * Oops, color space is full, wait on overflow queue.
+ * The next flush completion will assign us
+ * flush_color and transfer to flusher_queue.
+ */
+ list_add_tail(&this_flusher.list, &wq->flusher_overflow);
+ }
+
+ mutex_unlock(&wq->flush_mutex);
+
+ wait_for_completion(&this_flusher.done);
+
+ /*
+ * Wake-up-and-cascade phase
+ *
+ * First flushers are responsible for cascading flushes and
+ * handling overflow. Non-first flushers can simply return.
+ */
+ if (wq->first_flusher != &this_flusher)
+ return;
+
+ mutex_lock(&wq->flush_mutex);
+
+ /* we might have raced, check again with mutex held */
+ if (wq->first_flusher != &this_flusher)
+ goto out_unlock;
+
+ wq->first_flusher = NULL;
+
+ BUG_ON(!list_empty(&this_flusher.list));
+ BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+ while (true) {
+ struct wq_flusher *next, *tmp;
+
+ /* complete all the flushers sharing the current flush color */
+ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
+ if (next->flush_color != wq->flush_color)
+ break;
+ list_del_init(&next->list);
+ complete(&next->done);
+ }
+
+ BUG_ON(!list_empty(&wq->flusher_overflow) &&
+ wq->flush_color != work_next_color(wq->work_color));
+
+ /* this flush_color is finished, advance by one */
+ wq->flush_color = work_next_color(wq->flush_color);
+
+ /* one color has been freed, handle overflow queue */
+ if (!list_empty(&wq->flusher_overflow)) {
+ /*
+ * Assign the same color to all overflowed
+ * flushers, advance work_color and append to
+ * flusher_queue. This is the start-to-wait
+ * phase for these overflowed flushers.
+ */
+ list_for_each_entry(tmp, &wq->flusher_overflow, list)
+ tmp->flush_color = wq->work_color;
+
+ wq->work_color = work_next_color(wq->work_color);
+
+ list_splice_tail_init(&wq->flusher_overflow,
+ &wq->flusher_queue);
+ flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ }
+
+ if (list_empty(&wq->flusher_queue)) {
+ BUG_ON(wq->flush_color != wq->work_color);
+ break;
+ }
+
+ /*
+ * Need to flush more colors. Make the next flusher
+ * the new first flusher and arm cwqs.
+ */
+ BUG_ON(wq->flush_color == wq->work_color);
+ BUG_ON(wq->flush_color != next->flush_color);
+
+ list_del_init(&next->list);
+ wq->first_flusher = next;
+
+ if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+ break;
+
+ /*
+ * Meh... this color is already done, clear first
+ * flusher and repeat cascading.
+ */
+ wq->first_flusher = NULL;
+ }
+
+out_unlock:
+ mutex_unlock(&wq->flush_mutex);
}
EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,43 +2304,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
*/
int flush_work(struct work_struct *work)
{
+ struct worker *worker = NULL;
+ struct global_cwq *gcwq;
struct cpu_workqueue_struct *cwq;
- struct list_head *prev;
struct wq_barrier barr;
might_sleep();
- cwq = get_wq_data(work);
- if (!cwq)
+ gcwq = get_work_gcwq(work);
+ if (!gcwq)
return 0;
- lock_map_acquire(&cwq->wq->lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
-
- prev = NULL;
- spin_lock_irq(&cwq->lock);
+ spin_lock_irq(&gcwq->lock);
if (!list_empty(&work->entry)) {
/*
* See the comment near try_to_grab_pending()->smp_rmb().
- * If it was re-queued under us we are not going to wait.
+ * If it was re-queued to a different gcwq under us, we
+ * are not going to wait.
*/
smp_rmb();
- if (unlikely(cwq != get_wq_data(work)))
- goto out;
- prev = &work->entry;
+ cwq = get_work_cwq(work);
+ if (unlikely(!cwq || gcwq != cwq->gcwq))
+ goto already_gone;
} else {
- if (cwq->current_work != work)
- goto out;
- prev = &cwq->worklist;
+ worker = find_worker_executing_work(gcwq, work);
+ if (!worker)
+ goto already_gone;
+ cwq = worker->current_cwq;
}
- insert_wq_barrier(cwq, &barr, prev->next);
-out:
- spin_unlock_irq(&cwq->lock);
- if (!prev)
- return 0;
+
+ insert_wq_barrier(cwq, &barr, work, worker);
+ spin_unlock_irq(&gcwq->lock);
+
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
return 1;
+already_gone:
+ spin_unlock_irq(&gcwq->lock);
+ return 0;
}
EXPORT_SYMBOL_GPL(flush_work);
@@ -593,54 +2353,55 @@ EXPORT_SYMBOL_GPL(flush_work);
*/
static int try_to_grab_pending(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq;
+ struct global_cwq *gcwq;
int ret = -1;
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
return 0;
/*
* The queueing is in progress, or it is already queued. Try to
* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
*/
-
- cwq = get_wq_data(work);
- if (!cwq)
+ gcwq = get_work_gcwq(work);
+ if (!gcwq)
return ret;
- spin_lock_irq(&cwq->lock);
+ spin_lock_irq(&gcwq->lock);
if (!list_empty(&work->entry)) {
/*
- * This work is queued, but perhaps we locked the wrong cwq.
+ * This work is queued, but perhaps we locked the wrong gcwq.
* In that case we must see the new value after rmb(), see
* insert_work()->wmb().
*/
smp_rmb();
- if (cwq == get_wq_data(work)) {
+ if (gcwq == get_work_gcwq(work)) {
debug_work_deactivate(work);
list_del_init(&work->entry);
+ cwq_dec_nr_in_flight(get_work_cwq(work),
+ get_work_color(work));
ret = 1;
}
}
- spin_unlock_irq(&cwq->lock);
+ spin_unlock_irq(&gcwq->lock);
return ret;
}
-static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work)
+static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
{
struct wq_barrier barr;
- int running = 0;
+ struct worker *worker;
- spin_lock_irq(&cwq->lock);
- if (unlikely(cwq->current_work == work)) {
- insert_wq_barrier(cwq, &barr, cwq->worklist.next);
- running = 1;
- }
- spin_unlock_irq(&cwq->lock);
+ spin_lock_irq(&gcwq->lock);
+
+ worker = find_worker_executing_work(gcwq, work);
+ if (unlikely(worker))
+ insert_wq_barrier(worker->current_cwq, &barr, work, worker);
- if (unlikely(running)) {
+ spin_unlock_irq(&gcwq->lock);
+
+ if (unlikely(worker)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
}
@@ -648,9 +2409,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
static void wait_on_work(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq;
- struct workqueue_struct *wq;
- const struct cpumask *cpu_map;
int cpu;
might_sleep();
@@ -658,15 +2416,8 @@ static void wait_on_work(struct work_struct *work)
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
- cwq = get_wq_data(work);
- if (!cwq)
- return;
-
- wq = cwq->wq;
- cpu_map = wq_cpu_map(wq);
-
- for_each_cpu(cpu, cpu_map)
- wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ for_each_gcwq_cpu(cpu)
+ wait_on_cpu_work(get_gcwq(cpu), work);
}
static int __cancel_work_timer(struct work_struct *work,
@@ -681,7 +2432,7 @@ static int __cancel_work_timer(struct work_struct *work,
wait_on_work(work);
} while (unlikely(ret < 0));
- clear_wq_data(work);
+ clear_work_data(work);
return ret;
}
@@ -727,8 +2478,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork)
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
-static struct workqueue_struct *keventd_wq __read_mostly;
-
/**
* schedule_work - put work task in global workqueue
* @work: job to be done
@@ -742,7 +2491,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
*/
int schedule_work(struct work_struct *work)
{
- return queue_work(keventd_wq, work);
+ return queue_work(system_wq, work);
}
EXPORT_SYMBOL(schedule_work);
@@ -755,7 +2504,7 @@ EXPORT_SYMBOL(schedule_work);
*/
int schedule_work_on(int cpu, struct work_struct *work)
{
- return queue_work_on(cpu, keventd_wq, work);
+ return queue_work_on(cpu, system_wq, work);
}
EXPORT_SYMBOL(schedule_work_on);
@@ -770,7 +2519,7 @@ EXPORT_SYMBOL(schedule_work_on);
int schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
- return queue_delayed_work(keventd_wq, dwork, delay);
+ return queue_delayed_work(system_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work);
@@ -783,9 +2532,8 @@ EXPORT_SYMBOL(schedule_delayed_work);
void flush_delayed_work(struct delayed_work *dwork)
{
if (del_timer_sync(&dwork->timer)) {
- struct cpu_workqueue_struct *cwq;
- cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
- __queue_work(cwq, &dwork->work);
+ __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
+ &dwork->work);
put_cpu();
}
flush_work(&dwork->work);
@@ -804,7 +2552,7 @@ EXPORT_SYMBOL(flush_delayed_work);
int schedule_delayed_work_on(int cpu,
struct delayed_work *dwork, unsigned long delay)
{
- return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
+ return queue_delayed_work_on(cpu, system_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work_on);
@@ -820,8 +2568,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
int schedule_on_each_cpu(work_func_t func)
{
int cpu;
- int orig = -1;
- struct work_struct *works;
+ struct work_struct __percpu *works;
works = alloc_percpu(struct work_struct);
if (!works)
@@ -829,23 +2576,12 @@ int schedule_on_each_cpu(work_func_t func)
get_online_cpus();
- /*
- * When running in keventd don't schedule a work item on
- * itself. Can just call directly because the work queue is
- * already bound. This also is faster.
- */
- if (current_is_keventd())
- orig = raw_smp_processor_id();
-
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
INIT_WORK(work, func);
- if (cpu != orig)
- schedule_work_on(cpu, work);
+ schedule_work_on(cpu, work);
}
- if (orig >= 0)
- func(per_cpu_ptr(works, orig));
for_each_online_cpu(cpu)
flush_work(per_cpu_ptr(works, cpu));
@@ -881,7 +2617,7 @@ int schedule_on_each_cpu(work_func_t func)
*/
void flush_scheduled_work(void)
{
- flush_workqueue(keventd_wq);
+ flush_workqueue(system_wq);
}
EXPORT_SYMBOL(flush_scheduled_work);
@@ -913,170 +2649,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
int keventd_up(void)
{
- return keventd_wq != NULL;
+ return system_wq != NULL;
}
-int current_is_keventd(void)
+static int alloc_cwqs(struct workqueue_struct *wq)
{
- struct cpu_workqueue_struct *cwq;
- int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
- int ret = 0;
-
- BUG_ON(!keventd_wq);
+ /*
+ * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
+ * Make sure that the alignment isn't lower than that of
+ * unsigned long long.
+ */
+ const size_t size = sizeof(struct cpu_workqueue_struct);
+ const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
+ __alignof__(unsigned long long));
+#ifdef CONFIG_SMP
+ bool percpu = !(wq->flags & WQ_UNBOUND);
+#else
+ bool percpu = false;
+#endif
- cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
- if (current == cwq->thread)
- ret = 1;
+ if (percpu)
+ wq->cpu_wq.pcpu = __alloc_percpu(size, align);
+ else {
+ void *ptr;
- return ret;
+ /*
+ * Allocate enough room to align cwq and put an extra
+ * pointer at the end pointing back to the originally
+ * allocated pointer which will be used for free.
+ */
+ ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
+ if (ptr) {
+ wq->cpu_wq.single = PTR_ALIGN(ptr, align);
+ *(void **)(wq->cpu_wq.single + 1) = ptr;
+ }
+ }
+ /* just in case, make sure it's actually aligned */
+ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
+ return wq->cpu_wq.v ? 0 : -ENOMEM;
}
-static struct cpu_workqueue_struct *
-init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
+static void free_cwqs(struct workqueue_struct *wq)
{
- struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-
- cwq->wq = wq;
- spin_lock_init(&cwq->lock);
- INIT_LIST_HEAD(&cwq->worklist);
- init_waitqueue_head(&cwq->more_work);
+#ifdef CONFIG_SMP
+ bool percpu = !(wq->flags & WQ_UNBOUND);
+#else
+ bool percpu = false;
+#endif
- return cwq;
+ if (percpu)
+ free_percpu(wq->cpu_wq.pcpu);
+ else if (wq->cpu_wq.single) {
+ /* the pointer to free is stored right after the cwq */
+ kfree(*(void **)(wq->cpu_wq.single + 1));
+ }
}
-static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+static int wq_clamp_max_active(int max_active, unsigned int flags,
+ const char *name)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- struct workqueue_struct *wq = cwq->wq;
- const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
- struct task_struct *p;
+ int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
- p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
- /*
- * Nobody can add the work_struct to this cwq,
- * if (caller is __create_workqueue)
- * nobody should see this wq
- * else // caller is CPU_UP_PREPARE
- * cpu is not on cpu_online_map
- * so we can abort safely.
- */
- if (IS_ERR(p))
- return PTR_ERR(p);
- if (cwq->wq->rt)
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- cwq->thread = p;
-
- trace_workqueue_creation(cwq->thread, cpu);
+ if (max_active < 1 || max_active > lim)
+ printk(KERN_WARNING "workqueue: max_active %d requested for %s "
+ "is out of range, clamping between %d and %d\n",
+ max_active, name, 1, lim);
- return 0;
+ return clamp_val(max_active, 1, lim);
}
-static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+struct workqueue_struct *__alloc_workqueue_key(const char *name,
+ unsigned int flags,
+ int max_active,
+ struct lock_class_key *key,
+ const char *lock_name)
{
- struct task_struct *p = cwq->thread;
+ struct workqueue_struct *wq;
+ unsigned int cpu;
- if (p != NULL) {
- if (cpu >= 0)
- kthread_bind(p, cpu);
- wake_up_process(p);
- }
-}
+ /*
+ * Unbound workqueues aren't concurrency managed and should be
+ * dispatched to workers immediately.
+ */
+ if (flags & WQ_UNBOUND)
+ flags |= WQ_HIGHPRI;
-struct workqueue_struct *__create_workqueue_key(const char *name,
- int singlethread,
- int freezeable,
- int rt,
- struct lock_class_key *key,
- const char *lock_name)
-{
- struct workqueue_struct *wq;
- struct cpu_workqueue_struct *cwq;
- int err = 0, cpu;
+ max_active = max_active ?: WQ_DFL_ACTIVE;
+ max_active = wq_clamp_max_active(max_active, flags, name);
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
- return NULL;
+ goto err;
- wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
- if (!wq->cpu_wq) {
- kfree(wq);
- return NULL;
- }
+ wq->flags = flags;
+ wq->saved_max_active = max_active;
+ mutex_init(&wq->flush_mutex);
+ atomic_set(&wq->nr_cwqs_to_flush, 0);
+ INIT_LIST_HEAD(&wq->flusher_queue);
+ INIT_LIST_HEAD(&wq->flusher_overflow);
wq->name = name;
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
- wq->singlethread = singlethread;
- wq->freezeable = freezeable;
- wq->rt = rt;
INIT_LIST_HEAD(&wq->list);
- if (singlethread) {
- cwq = init_cpu_workqueue(wq, singlethread_cpu);
- err = create_workqueue_thread(cwq, singlethread_cpu);
- start_workqueue_thread(cwq, -1);
- } else {
- cpu_maps_update_begin();
- /*
- * We must place this wq on list even if the code below fails.
- * cpu_down(cpu) can remove cpu from cpu_populated_map before
- * destroy_workqueue() takes the lock, in that case we leak
- * cwq[cpu]->thread.
- */
- spin_lock(&workqueue_lock);
- list_add(&wq->list, &workqueues);
- spin_unlock(&workqueue_lock);
- /*
- * We must initialize cwqs for each possible cpu even if we
- * are going to call destroy_workqueue() finally. Otherwise
- * cpu_up() can hit the uninitialized cwq once we drop the
- * lock.
- */
- for_each_possible_cpu(cpu) {
- cwq = init_cpu_workqueue(wq, cpu);
- if (err || !cpu_online(cpu))
- continue;
- err = create_workqueue_thread(cwq, cpu);
- start_workqueue_thread(cwq, cpu);
- }
- cpu_maps_update_done();
+ if (alloc_cwqs(wq) < 0)
+ goto err;
+
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct global_cwq *gcwq = get_gcwq(cpu);
+
+ BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
+ cwq->gcwq = gcwq;
+ cwq->wq = wq;
+ cwq->flush_color = -1;
+ cwq->max_active = max_active;
+ INIT_LIST_HEAD(&cwq->delayed_works);
}
- if (err) {
- destroy_workqueue(wq);
- wq = NULL;
+ if (flags & WQ_RESCUER) {
+ struct worker *rescuer;
+
+ if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
+ goto err;
+
+ wq->rescuer = rescuer = alloc_worker();
+ if (!rescuer)
+ goto err;
+
+ rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+ if (IS_ERR(rescuer->task))
+ goto err;
+
+ wq->rescuer = rescuer;
+ rescuer->task->flags |= PF_THREAD_BOUND;
+ wake_up_process(rescuer->task);
}
- return wq;
-}
-EXPORT_SYMBOL_GPL(__create_workqueue_key);
-static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
-{
/*
- * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
- * cpu_add_remove_lock protects cwq->thread.
+ * workqueue_lock protects global freeze state and workqueues
+ * list. Grab it, set max_active accordingly and add the new
+ * workqueue to workqueues list.
*/
- if (cwq->thread == NULL)
- return;
+ spin_lock(&workqueue_lock);
- lock_map_acquire(&cwq->wq->lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
+ for_each_cwq_cpu(cpu, wq)
+ get_cwq(cpu, wq)->max_active = 0;
- flush_cpu_workqueue(cwq);
- /*
- * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
- * a concurrent flush_workqueue() can insert a barrier after us.
- * However, in that case run_workqueue() won't return and check
- * kthread_should_stop() until it flushes all work_struct's.
- * When ->worklist becomes empty it is safe to exit because no
- * more work_structs can be queued on this cwq: flush_workqueue
- * checks list_empty(), and a "normal" queue_work() can't use
- * a dead CPU.
- */
- trace_workqueue_destruction(cwq->thread);
- kthread_stop(cwq->thread);
- cwq->thread = NULL;
+ list_add(&wq->list, &workqueues);
+
+ spin_unlock(&workqueue_lock);
+
+ return wq;
+err:
+ if (wq) {
+ free_cwqs(wq);
+ free_mayday_mask(wq->mayday_mask);
+ kfree(wq->rescuer);
+ kfree(wq);
+ }
+ return NULL;
}
+EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
/**
* destroy_workqueue - safely terminate a workqueue
@@ -1086,71 +2822,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
- const struct cpumask *cpu_map = wq_cpu_map(wq);
- int cpu;
+ unsigned int cpu;
+
+ flush_workqueue(wq);
- cpu_maps_update_begin();
+ /*
+ * wq list is used to freeze wq, remove from list after
+ * flushing is complete in case freeze races us.
+ */
spin_lock(&workqueue_lock);
list_del(&wq->list);
spin_unlock(&workqueue_lock);
- for_each_cpu(cpu, cpu_map)
- cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
- cpu_maps_update_done();
+ /* sanity check */
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ int i;
+
+ for (i = 0; i < WORK_NR_COLORS; i++)
+ BUG_ON(cwq->nr_in_flight[i]);
+ BUG_ON(cwq->nr_active);
+ BUG_ON(!list_empty(&cwq->delayed_works));
+ }
+
+ if (wq->flags & WQ_RESCUER) {
+ kthread_stop(wq->rescuer->task);
+ free_mayday_mask(wq->mayday_mask);
+ }
- free_percpu(wq->cpu_wq);
+ free_cwqs(wq);
kfree(wq);
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
+/**
+ * workqueue_set_max_active - adjust max_active of a workqueue
+ * @wq: target workqueue
+ * @max_active: new max_active value.
+ *
+ * Set max_active of @wq to @max_active.
+ *
+ * CONTEXT:
+ * Don't call from IRQ context.
+ */
+void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
+{
+ unsigned int cpu;
+
+ max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
+
+ spin_lock(&workqueue_lock);
+
+ wq->saved_max_active = max_active;
+
+ for_each_cwq_cpu(cpu, wq) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (!(wq->flags & WQ_FREEZEABLE) ||
+ !(gcwq->flags & GCWQ_FREEZING))
+ get_cwq(gcwq->cpu, wq)->max_active = max_active;
+
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ spin_unlock(&workqueue_lock);
+}
+EXPORT_SYMBOL_GPL(workqueue_set_max_active);
+
+/**
+ * workqueue_congested - test whether a workqueue is congested
+ * @cpu: CPU in question
+ * @wq: target workqueue
+ *
+ * Test whether @wq's cpu workqueue for @cpu is congested. There is
+ * no synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ *
+ * RETURNS:
+ * %true if congested, %false otherwise.
+ */
+bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
+{
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ return !list_empty(&cwq->delayed_works);
+}
+EXPORT_SYMBOL_GPL(workqueue_congested);
+
+/**
+ * work_cpu - return the last known associated cpu for @work
+ * @work: the work of interest
+ *
+ * RETURNS:
+ * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
+ */
+unsigned int work_cpu(struct work_struct *work)
+{
+ struct global_cwq *gcwq = get_work_gcwq(work);
+
+ return gcwq ? gcwq->cpu : WORK_CPU_NONE;
+}
+EXPORT_SYMBOL_GPL(work_cpu);
+
+/**
+ * work_busy - test whether a work is currently pending or running
+ * @work: the work to be tested
+ *
+ * Test whether @work is currently pending or running. There is no
+ * synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ * Especially for reentrant wqs, the pending state might hide the
+ * running state.
+ *
+ * RETURNS:
+ * OR'd bitmask of WORK_BUSY_* bits.
+ */
+unsigned int work_busy(struct work_struct *work)
+{
+ struct global_cwq *gcwq = get_work_gcwq(work);
+ unsigned long flags;
+ unsigned int ret = 0;
+
+ if (!gcwq)
+ return false;
+
+ spin_lock_irqsave(&gcwq->lock, flags);
+
+ if (work_pending(work))
+ ret |= WORK_BUSY_PENDING;
+ if (find_worker_executing_work(gcwq, work))
+ ret |= WORK_BUSY_RUNNING;
+
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(work_busy);
+
+/*
+ * CPU hotplug.
+ *
+ * There are two challenges in supporting CPU hotplug. Firstly, there
+ * are a lot of assumptions on strong associations among work, cwq and
+ * gcwq which make migrating pending and scheduled works very
+ * difficult to implement without impacting hot paths. Secondly,
+ * gcwqs serve mix of short, long and very long running works making
+ * blocked draining impractical.
+ *
+ * This is solved by allowing a gcwq to be detached from CPU, running
+ * it with unbound (rogue) workers and allowing it to be reattached
+ * later if the cpu comes back online. A separate thread is created
+ * to govern a gcwq in such state and is called the trustee of the
+ * gcwq.
+ *
+ * Trustee states and their descriptions.
+ *
+ * START Command state used on startup. On CPU_DOWN_PREPARE, a
+ * new trustee is started with this state.
+ *
+ * IN_CHARGE Once started, trustee will enter this state after
+ * assuming the manager role and making all existing
+ * workers rogue. DOWN_PREPARE waits for trustee to
+ * enter this state. After reaching IN_CHARGE, trustee
+ * tries to execute the pending worklist until it's empty
+ * and the state is set to BUTCHER, or the state is set
+ * to RELEASE.
+ *
+ * BUTCHER Command state which is set by the cpu callback after
+ * the cpu has went down. Once this state is set trustee
+ * knows that there will be no new works on the worklist
+ * and once the worklist is empty it can proceed to
+ * killing idle workers.
+ *
+ * RELEASE Command state which is set by the cpu callback if the
+ * cpu down has been canceled or it has come online
+ * again. After recognizing this state, trustee stops
+ * trying to drain or butcher and clears ROGUE, rebinds
+ * all remaining workers back to the cpu and releases
+ * manager role.
+ *
+ * DONE Trustee will enter this state after BUTCHER or RELEASE
+ * is complete.
+ *
+ * trustee CPU draining
+ * took over down complete
+ * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
+ * | | ^
+ * | CPU is back online v return workers |
+ * ----------------> RELEASE --------------
+ */
+
+/**
+ * trustee_wait_event_timeout - timed event wait for trustee
+ * @cond: condition to wait for
+ * @timeout: timeout in jiffies
+ *
+ * wait_event_timeout() for trustee to use. Handles locking and
+ * checks for RELEASE request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. To be used by trustee.
+ *
+ * RETURNS:
+ * Positive indicating left time if @cond is satisfied, 0 if timed
+ * out, -1 if canceled.
+ */
+#define trustee_wait_event_timeout(cond, timeout) ({ \
+ long __ret = (timeout); \
+ while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
+ __ret) { \
+ spin_unlock_irq(&gcwq->lock); \
+ __wait_event_timeout(gcwq->trustee_wait, (cond) || \
+ (gcwq->trustee_state == TRUSTEE_RELEASE), \
+ __ret); \
+ spin_lock_irq(&gcwq->lock); \
+ } \
+ gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
+})
+
+/**
+ * trustee_wait_event - event wait for trustee
+ * @cond: condition to wait for
+ *
+ * wait_event() for trustee to use. Automatically handles locking and
+ * checks for CANCEL request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. To be used by trustee.
+ *
+ * RETURNS:
+ * 0 if @cond is satisfied, -1 if canceled.
+ */
+#define trustee_wait_event(cond) ({ \
+ long __ret1; \
+ __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
+ __ret1 < 0 ? -1 : 0; \
+})
+
+static int __cpuinit trustee_thread(void *__gcwq)
+{
+ struct global_cwq *gcwq = __gcwq;
+ struct worker *worker;
+ struct work_struct *work;
+ struct hlist_node *pos;
+ long rc;
+ int i;
+
+ BUG_ON(gcwq->cpu != smp_processor_id());
+
+ spin_lock_irq(&gcwq->lock);
+ /*
+ * Claim the manager position and make all workers rogue.
+ * Trustee must be bound to the target cpu and can't be
+ * cancelled.
+ */
+ BUG_ON(gcwq->cpu != smp_processor_id());
+ rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
+ BUG_ON(rc < 0);
+
+ gcwq->flags |= GCWQ_MANAGING_WORKERS;
+
+ list_for_each_entry(worker, &gcwq->idle_list, entry)
+ worker->flags |= WORKER_ROGUE;
+
+ for_each_busy_worker(worker, i, pos, gcwq)
+ worker->flags |= WORKER_ROGUE;
+
+ /*
+ * Call schedule() so that we cross rq->lock and thus can
+ * guarantee sched callbacks see the rogue flag. This is
+ * necessary as scheduler callbacks may be invoked from other
+ * cpus.
+ */
+ spin_unlock_irq(&gcwq->lock);
+ schedule();
+ spin_lock_irq(&gcwq->lock);
+
+ /*
+ * Sched callbacks are disabled now. Zap nr_running. After
+ * this, nr_running stays zero and need_more_worker() and
+ * keep_working() are always true as long as the worklist is
+ * not empty.
+ */
+ atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+
+ spin_unlock_irq(&gcwq->lock);
+ del_timer_sync(&gcwq->idle_timer);
+ spin_lock_irq(&gcwq->lock);
+
+ /*
+ * We're now in charge. Notify and proceed to drain. We need
+ * to keep the gcwq running during the whole CPU down
+ * procedure as other cpu hotunplug callbacks may need to
+ * flush currently running tasks.
+ */
+ gcwq->trustee_state = TRUSTEE_IN_CHARGE;
+ wake_up_all(&gcwq->trustee_wait);
+
+ /*
+ * The original cpu is in the process of dying and may go away
+ * anytime now. When that happens, we and all workers would
+ * be migrated to other cpus. Try draining any left work. We
+ * want to get it over with ASAP - spam rescuers, wake up as
+ * many idlers as necessary and create new ones till the
+ * worklist is empty. Note that if the gcwq is frozen, there
+ * may be frozen works in freezeable cwqs. Don't declare
+ * completion while frozen.
+ */
+ while (gcwq->nr_workers != gcwq->nr_idle ||
+ gcwq->flags & GCWQ_FREEZING ||
+ gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
+ int nr_works = 0;
+
+ list_for_each_entry(work, &gcwq->worklist, entry) {
+ send_mayday(work);
+ nr_works++;
+ }
+
+ list_for_each_entry(worker, &gcwq->idle_list, entry) {
+ if (!nr_works--)
+ break;
+ wake_up_process(worker->task);
+ }
+
+ if (need_to_create_worker(gcwq)) {
+ spin_unlock_irq(&gcwq->lock);
+ worker = create_worker(gcwq, false);
+ spin_lock_irq(&gcwq->lock);
+ if (worker) {
+ worker->flags |= WORKER_ROGUE;
+ start_worker(worker);
+ }
+ }
+
+ /* give a breather */
+ if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
+ break;
+ }
+
+ /*
+ * Either all works have been scheduled and cpu is down, or
+ * cpu down has already been canceled. Wait for and butcher
+ * all workers till we're canceled.
+ */
+ do {
+ rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
+ while (!list_empty(&gcwq->idle_list))
+ destroy_worker(list_first_entry(&gcwq->idle_list,
+ struct worker, entry));
+ } while (gcwq->nr_workers && rc >= 0);
+
+ /*
+ * At this point, either draining has completed and no worker
+ * is left, or cpu down has been canceled or the cpu is being
+ * brought back up. There shouldn't be any idle one left.
+ * Tell the remaining busy ones to rebind once it finishes the
+ * currently scheduled works by scheduling the rebind_work.
+ */
+ WARN_ON(!list_empty(&gcwq->idle_list));
+
+ for_each_busy_worker(worker, i, pos, gcwq) {
+ struct work_struct *rebind_work = &worker->rebind_work;
+
+ /*
+ * Rebind_work may race with future cpu hotplug
+ * operations. Use a separate flag to mark that
+ * rebinding is scheduled.
+ */
+ worker->flags |= WORKER_REBIND;
+ worker->flags &= ~WORKER_ROGUE;
+
+ /* queue rebind_work, wq doesn't matter, use the default one */
+ if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+ work_data_bits(rebind_work)))
+ continue;
+
+ debug_work_activate(rebind_work);
+ insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+ worker->scheduled.next,
+ work_color_to_flags(WORK_NO_COLOR));
+ }
+
+ /* relinquish manager role */
+ gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+ /* notify completion */
+ gcwq->trustee = NULL;
+ gcwq->trustee_state = TRUSTEE_DONE;
+ wake_up_all(&gcwq->trustee_wait);
+ spin_unlock_irq(&gcwq->lock);
+ return 0;
+}
+
+/**
+ * wait_trustee_state - wait for trustee to enter the specified state
+ * @gcwq: gcwq the trustee of interest belongs to
+ * @state: target state to wait for
+ *
+ * Wait for the trustee to reach @state. DONE is already matched.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. To be used by cpu_callback.
+ */
+static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
+{
+ if (!(gcwq->trustee_state == state ||
+ gcwq->trustee_state == TRUSTEE_DONE)) {
+ spin_unlock_irq(&gcwq->lock);
+ __wait_event(gcwq->trustee_wait,
+ gcwq->trustee_state == state ||
+ gcwq->trustee_state == TRUSTEE_DONE);
+ spin_lock_irq(&gcwq->lock);
+ }
+}
+
static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct cpu_workqueue_struct *cwq;
- struct workqueue_struct *wq;
- int ret = NOTIFY_OK;
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct task_struct *new_trustee = NULL;
+ struct worker *uninitialized_var(new_worker);
+ unsigned long flags;
action &= ~CPU_TASKS_FROZEN;
switch (action) {
+ case CPU_DOWN_PREPARE:
+ new_trustee = kthread_create(trustee_thread, gcwq,
+ "workqueue_trustee/%d\n", cpu);
+ if (IS_ERR(new_trustee))
+ return notifier_from_errno(PTR_ERR(new_trustee));
+ kthread_bind(new_trustee, cpu);
+ /* fall through */
case CPU_UP_PREPARE:
- cpumask_set_cpu(cpu, cpu_populated_map);
- }
-undo:
- list_for_each_entry(wq, &workqueues, list) {
- cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- if (!create_workqueue_thread(cwq, cpu))
- break;
- printk(KERN_ERR "workqueue [%s] for %i failed\n",
- wq->name, cpu);
- action = CPU_UP_CANCELED;
- ret = NOTIFY_BAD;
- goto undo;
-
- case CPU_ONLINE:
- start_workqueue_thread(cwq, cpu);
- break;
-
- case CPU_UP_CANCELED:
- start_workqueue_thread(cwq, -1);
- case CPU_POST_DEAD:
- cleanup_workqueue_thread(cwq);
- break;
+ BUG_ON(gcwq->first_idle);
+ new_worker = create_worker(gcwq, false);
+ if (!new_worker) {
+ if (new_trustee)
+ kthread_stop(new_trustee);
+ return NOTIFY_BAD;
}
}
+ /* some are called w/ irq disabled, don't disturb irq status */
+ spin_lock_irqsave(&gcwq->lock, flags);
+
switch (action) {
- case CPU_UP_CANCELED:
+ case CPU_DOWN_PREPARE:
+ /* initialize trustee and tell it to acquire the gcwq */
+ BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
+ gcwq->trustee = new_trustee;
+ gcwq->trustee_state = TRUSTEE_START;
+ wake_up_process(gcwq->trustee);
+ wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
+ /* fall through */
+ case CPU_UP_PREPARE:
+ BUG_ON(gcwq->first_idle);
+ gcwq->first_idle = new_worker;
+ break;
+
+ case CPU_DYING:
+ /*
+ * Before this, the trustee and all workers except for
+ * the ones which are still executing works from
+ * before the last CPU down must be on the cpu. After
+ * this, they'll all be diasporas.
+ */
+ gcwq->flags |= GCWQ_DISASSOCIATED;
+ break;
+
case CPU_POST_DEAD:
- cpumask_clear_cpu(cpu, cpu_populated_map);
+ gcwq->trustee_state = TRUSTEE_BUTCHER;
+ /* fall through */
+ case CPU_UP_CANCELED:
+ destroy_worker(gcwq->first_idle);
+ gcwq->first_idle = NULL;
+ break;
+
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ gcwq->flags &= ~GCWQ_DISASSOCIATED;
+ if (gcwq->trustee_state != TRUSTEE_DONE) {
+ gcwq->trustee_state = TRUSTEE_RELEASE;
+ wake_up_process(gcwq->trustee);
+ wait_trustee_state(gcwq, TRUSTEE_DONE);
+ }
+
+ /*
+ * Trustee is done and there might be no worker left.
+ * Put the first_idle in and request a real manager to
+ * take a look.
+ */
+ spin_unlock_irq(&gcwq->lock);
+ kthread_bind(gcwq->first_idle->task, cpu);
+ spin_lock_irq(&gcwq->lock);
+ gcwq->flags |= GCWQ_MANAGE_WORKERS;
+ start_worker(gcwq->first_idle);
+ gcwq->first_idle = NULL;
+ break;
}
- return ret;
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+
+ return notifier_from_errno(0);
}
#ifdef CONFIG_SMP
@@ -1200,14 +3381,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
EXPORT_SYMBOL_GPL(work_on_cpu);
#endif /* CONFIG_SMP */
-void __init init_workqueues(void)
+#ifdef CONFIG_FREEZER
+
+/**
+ * freeze_workqueues_begin - begin freezing workqueues
+ *
+ * Start freezing workqueues. After this function returns, all
+ * freezeable workqueues will queue new works to their frozen_works
+ * list instead of gcwq->worklist.
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock and gcwq->lock's.
+ */
+void freeze_workqueues_begin(void)
{
- alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
+ unsigned int cpu;
- cpumask_copy(cpu_populated_map, cpu_online_mask);
- singlethread_cpu = cpumask_first(cpu_possible_mask);
- cpu_singlethread_map = cpumask_of(singlethread_cpu);
- hotcpu_notifier(workqueue_cpu_callback, 0);
- keventd_wq = create_workqueue("events");
- BUG_ON(!keventd_wq);
+ spin_lock(&workqueue_lock);
+
+ BUG_ON(workqueue_freezing);
+ workqueue_freezing = true;
+
+ for_each_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct workqueue_struct *wq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ BUG_ON(gcwq->flags & GCWQ_FREEZING);
+ gcwq->flags |= GCWQ_FREEZING;
+
+ list_for_each_entry(wq, &workqueues, list) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (cwq && wq->flags & WQ_FREEZEABLE)
+ cwq->max_active = 0;
+ }
+
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ spin_unlock(&workqueue_lock);
+}
+
+/**
+ * freeze_workqueues_busy - are freezeable workqueues still busy?
+ *
+ * Check whether freezing is complete. This function must be called
+ * between freeze_workqueues_begin() and thaw_workqueues().
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock.
+ *
+ * RETURNS:
+ * %true if some freezeable workqueues are still busy. %false if
+ * freezing is complete.
+ */
+bool freeze_workqueues_busy(void)
+{
+ unsigned int cpu;
+ bool busy = false;
+
+ spin_lock(&workqueue_lock);
+
+ BUG_ON(!workqueue_freezing);
+
+ for_each_gcwq_cpu(cpu) {
+ struct workqueue_struct *wq;
+ /*
+ * nr_active is monotonically decreasing. It's safe
+ * to peek without lock.
+ */
+ list_for_each_entry(wq, &workqueues, list) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ continue;
+
+ BUG_ON(cwq->nr_active < 0);
+ if (cwq->nr_active) {
+ busy = true;
+ goto out_unlock;
+ }
+ }
+ }
+out_unlock:
+ spin_unlock(&workqueue_lock);
+ return busy;
+}
+
+/**
+ * thaw_workqueues - thaw workqueues
+ *
+ * Thaw workqueues. Normal queueing is restored and all collected
+ * frozen works are transferred to their respective gcwq worklists.
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock and gcwq->lock's.
+ */
+void thaw_workqueues(void)
+{
+ unsigned int cpu;
+
+ spin_lock(&workqueue_lock);
+
+ if (!workqueue_freezing)
+ goto out_unlock;
+
+ for_each_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct workqueue_struct *wq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
+ gcwq->flags &= ~GCWQ_FREEZING;
+
+ list_for_each_entry(wq, &workqueues, list) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ continue;
+
+ /* restore max_active and repopulate worklist */
+ cwq->max_active = wq->saved_max_active;
+
+ while (!list_empty(&cwq->delayed_works) &&
+ cwq->nr_active < cwq->max_active)
+ cwq_activate_first_delayed(cwq);
+ }
+
+ wake_up_worker(gcwq);
+
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ workqueue_freezing = false;
+out_unlock:
+ spin_unlock(&workqueue_lock);
+}
+#endif /* CONFIG_FREEZER */
+
+static int __init init_workqueues(void)
+{
+ unsigned int cpu;
+ int i;
+
+ cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+
+ /* initialize gcwqs */
+ for_each_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+
+ spin_lock_init(&gcwq->lock);
+ INIT_LIST_HEAD(&gcwq->worklist);
+ gcwq->cpu = cpu;
+ if (cpu == WORK_CPU_UNBOUND)
+ gcwq->flags |= GCWQ_DISASSOCIATED;
+
+ INIT_LIST_HEAD(&gcwq->idle_list);
+ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
+
+ init_timer_deferrable(&gcwq->idle_timer);
+ gcwq->idle_timer.function = idle_worker_timeout;
+ gcwq->idle_timer.data = (unsigned long)gcwq;
+
+ setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
+ (unsigned long)gcwq);
+
+ ida_init(&gcwq->worker_ida);
+
+ gcwq->trustee_state = TRUSTEE_DONE;
+ init_waitqueue_head(&gcwq->trustee_wait);
+ }
+
+ /* create the initial worker */
+ for_each_online_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct worker *worker;
+
+ worker = create_worker(gcwq, true);
+ BUG_ON(!worker);
+ spin_lock_irq(&gcwq->lock);
+ start_worker(worker);
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ system_wq = alloc_workqueue("events", 0, 0);
+ system_long_wq = alloc_workqueue("events_long", 0, 0);
+ system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
+ WQ_UNBOUND_MAX_ACTIVE);
+ BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
+ return 0;
}
+early_initcall(init_workqueues);
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
+/*
+ * kernel/workqueue_sched.h
+ *
+ * Scheduler hooks for concurrency managed workqueue. Only to be
+ * included from sched.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+ unsigned int cpu);